# Copyright 2013-2014 Stefan Goebel.
#
# This file is part of Newcomen.
#
# Newcomen is free software: you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Newcomen is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
# the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Newcomen. If not, see
# <http://www.gnu.org/licenses/>.
package Newcomen::Role::Crawler::Filesystem;
our $VERSION = 2014052501;
use namespace::autoclean;
use Moose::Role;
use Path::Class;
use Newcomen::Source;
use Newcomen::Util::Path;
# Crawl one or more data directories below a root directory and create one Newcomen::Source
# instance for every file system item accepted by the supplied filter. No file content is read,
# only path related meta data (stored under the 'path' key) is set on the sources.
#
# Parameters (positional, after the invocant):
#   $root - Root directory, as a relative or absolute path. Must be a true value and must be an
#           existing directory.
#   $dirs - A single data directory or an arrayref of data directories, relative to $root.
#   $call - Filter. A Regexp is matched against the item's path relative to the root directory
#           (Unix format). A code reference is called with four strings (absolute path, path
#           relative to the root, path relative to the data directory, basename) and the item
#           is included if it returns a true value. Any other value includes every item.
#   $crwl - Optional creator of the sources. Defaults to the caller's package name with a leading
#           'Newcomen::Plugin::' removed.
#
# Returns the list of Newcomen::Source instances created.
#
# Confesses if the root directory or a data directory is invalid, or if a data directory is not
# contained in the root directory.
sub _crawl_fs {

   my $self = shift;
   my $root = shift;
   my $dirs = ref $_ [0] eq 'ARRAY' ? shift : [shift];
   my $call = shift;
   my $crwl = shift;
   my @list = ();

   # Default creator: name of the calling package, without the plugin namespace prefix.
   ($crwl = scalar caller) =~ s/^Newcomen::Plugin::// unless $crwl;

   # Reject a missing or false root before it is passed on to Path::Class, so that the caller gets
   # the intended error message instead of a failure somewhere inside the path handling.
   confess 'Invalid root directory: ' . (defined $root ? $root : '') unless $root;

   my $pr_cfg = $root;
   my $pr_abs = Newcomen::Util::Path::resolute (Path::Class::Dir -> new ($pr_cfg));

   confess "Invalid root directory: $pr_cfg" unless -d $pr_abs -> stringify ();

   for my $dir (@$dirs) {

      my $pd_cfg = $dir;
      my $pd_abs = Newcomen::Util::Path::resolute ($pr_abs -> subdir ($dir));

      # Relative paths are always reported in Unix format, like all other relative paths below.
      my $pd_rel = $pd_abs -> relative ($pr_abs) -> as_foreign ('Unix');

      confess "Invalid data directory: $pd_cfg"     unless -d $pd_abs -> stringify ();
      confess "Data directory not in root: $pd_cfg" unless $pr_abs -> contains ($pd_abs);

      $pd_abs -> recurse ('callback' => sub {

         my $ps_abs  = Newcomen::Util::Path::resolute (shift);
         my $ps_data = $ps_abs -> relative ($pd_abs) -> as_foreign ('Unix');
         my $ps_root = $ps_abs -> relative ($pr_abs) -> as_foreign ('Unix');
         my $ps_base = $ps_abs -> basename ();

         # Skip the data directory itself, and everything that (after resolving the path) is not
         # located inside the data directory.
         return if $ps_abs -> stringify () eq $pd_abs -> stringify ();
         return unless $pd_abs -> contains ($ps_abs);

         # Apply the user supplied filter. If it is neither a regexp nor a code reference, all
         # items are accepted.
         if (ref $call eq 'Regexp') {
            return unless $ps_root -> stringify () =~ $call;
         }
         elsif (ref $call eq 'CODE') {
            return unless $call -> (
               $ps_abs  -> stringify (),
               $ps_root -> stringify (),
               $ps_data -> stringify (),
               $ps_base
            );
         }

         my $pp_abs  = Newcomen::Util::Path::resolute ($ps_abs -> parent ());
         my $pp_data = $pp_abs -> relative ($pd_abs) -> as_foreign ('Unix');
         my $pp_root = $pp_abs -> relative ($pr_abs) -> as_foreign ('Unix');

         # The parent directory has to be inside the data directory, too.
         return unless $pd_abs -> contains ($pp_abs);

         # The source ID is the item's path relative to the root directory.
         my $src = Newcomen::Source -> new (
            'creator' => $crwl,
            'id'      => $ps_root -> stringify (),
         );

         $src -> set (['path'], {
            'root' => {
               'absolute' => $pr_abs -> stringify (),
               'config'   => $pr_cfg,
            },
            'data' => {
               'absolute' => $pd_abs -> stringify (),
               'relative' => $pd_rel -> stringify (),
               'config'   => $pd_cfg,
            },
            'source' => {
               'absolute' => $ps_abs  -> stringify (),
               'data'     => $ps_data -> stringify (),
               'root'     => $ps_root -> stringify (),
               'basename' => $ps_base,
            },
            'parent' => {
               'absolute' => $pp_abs  -> stringify (),
               'data'     => $pp_data -> stringify (),
               'root'     => $pp_root -> stringify (),
            },
         });

         push @list, $src;

      });

   }

   return @list;

}
1;
__END__
####################################################################################################
=head1 NAME
Newcomen::Role::Crawler::Filesystem - Moose role for plugins to crawl the filesystem.
=head1 SYNOPSIS
with 'Newcomen::Role::Crawler::Filesystem';
# The _crawl_fs() method will be available:
my @sources = $self -> _crawl_fs (
$root, $dirs, $match, $creator
);
# Include only files (or directories!) with .txt extension:
my @sources = $self -> _crawl_fs (
$root, $dirs, qr/\.txt$/
);
# Include only non-empty text files:
my @sources = $self -> _crawl_fs (
$root, $dirs, sub { -f -T -s $_ [0] }
);
=head1 DESCRIPTION
This role adds the L<_crawl_fs()|/_crawl_fs> method to the consuming class. This method may be used
to gather a list of data sources from the file system.
=head2 Sources
The ID of the L<Newcomen::Source> instances returned by the L<_crawl_fs()|/_crawl_fs> method will be
set to the item path relative to the specified root directory (using Unix path format with C<'/'> as
path separator).
The creator of the source items will be set as described below, see L<_crawl_fs()|/_crawl_fs>.
No file will actually be read, so the source items' content will not be set. Meta data will be set
as described below.
=head1 META DATA
=head2 Sources
{
'path' => {
'root' => {
'absolute' => $root_abs,
'config' => $root_cfg,
},
'data' => {
'absolute' => $data_abs,
'relative' => $data_rel,
'config' => $data_cfg,
},
'source' => {
'absolute' => $src_abs,
'data' => $src_data,
'root' => $src_root,
'basename' => $src_base,
},
'parent' => {
'absolute' => $parent_abs,
'data' => $parent_data,
'root' => $parent_root,
},
},
}
This is the meta data set for all L<Newcomen::Source> instances returned by the
L<_crawl_fs()|/_crawl_fs> method.
=over
=item I<path/root/absolute>
Absolute path of the root directory used by L<_crawl_fs()|/_crawl_fs>.
=item I<path/root/config>
Path of the root directory as supplied to L<_crawl_fs()|/_crawl_fs>.
=item I<path/data/absolute>
Absolute path of the source item's data directory.
=item I<path/data/relative>
Path of the source item's data directory relative to the root directory used by
L<_crawl_fs()|/_crawl_fs>.
=item I<path/data/config>
Path of the source item's data directory as supplied to L<_crawl_fs()|/_crawl_fs>.
=item I<path/source/absolute>
Absolute path of the source item.
=item I<path/source/data>
Path of the source item relative to its data directory.
=item I<path/source/root>
Path of the source item relative to the root directory used by L<_crawl_fs()|/_crawl_fs>.
=item I<path/source/basename>
Basename of the source item.
=item I<path/parent/absolute>
Absolute path of the source item's parent directory.
=item I<path/parent/data>
Path of the source item's parent directory relative to its data directory.
=item I<path/parent/root>
Path of the source item's parent directory relative to the root directory used by
L<_crawl_fs()|/_crawl_fs>.
=back
Note: All relative paths will use the Unix format, i.e. the slash (C<'/'>) will be used as directory
separator. Absolute paths will use the native format. Absolute paths will be resolved, and relative
paths will be built from these resolved absolute paths. Relative paths may be or include C<'.'> or
C<'..'> elements. The information about the root directory (which is usually not the physical root
directory of a file system) will be the same for all source instances returned from one call to the
L<_crawl_fs()|/_crawl_fs> method.
=head1 INSTANCE METHODS
=head2 _crawl_fs
my @sources = $self -> _crawl_fs ($root, $dirs, $match, $creator);
Returns a list of L<Newcomen::Source> instances created from data on the file system.
The first parameter specifies the root directory to be used (usually the root directory of the
B<Newcomen> project), as a relative or absolute path.
The second parameter must either be a string, or an arrayref of strings. These strings specify any
number of data directories, as paths relative to the aforementioned root directory. All data
directories will be searched recursively for source files.
The third parameter must either be a regexp (C<qr/.../>) or a code reference. If it is a regular
expression, the path of every item found relative to the root directory (as specified by the first
parameter) will be checked against it, and all matching items will be included in the list returned.
Note that this includes not only
files, but also any matching directories or links (subject to the restrictions mentioned below). If the third parameter is a subroutine reference,
this subroutine will be called for every item found. If the subroutine returns a true value, the
source item will be included in the list returned. The subroutine is called with four parameters:
the absolute path to the current item, the path of the current item relative to the supplied root directory,
the path of the current item relative to the data directory, and the current item's basename, in that
order. All these parameters are strings. Note that relative paths will be in Unix format, i.e. using
C<'/'> as path separator. Absolute paths will be in native format.
The optional fourth parameter may be used to specify a value for the I<creator> attribute of the
L<Newcomen::Source> instances. If this parameter is omitted, it defaults to the caller's name, with
the C<'Newcomen::Plugin::'> prefix removed (if it is supplied it will be used unaltered).
I<_crawl_fs()> will I<die()> on any error. All supplied directories must exist and be readable. All
directories must be contained in the higher level ones, i.e. the data directories must be located
physically beneath the root directory (directly or indirectly), and all source items must be located
beneath their data directory! This must be true for the resolved paths, i.e. links may not work,
depending on their destination.
=head1 SEE ALSO
L<Newcomen::Source>, L<Newcomen::Util::Path>, L<Path::Class>
=head1 VERSION
This is version C<2014052501>.
=head1 AUTHOR
Stefan Goebel - newcomen {at} subtype {dot} de
=head1 COPYRIGHT AND LICENSE
Copyright 2013-2014 Stefan Goebel.
This file is part of Newcomen.
Newcomen is free software: you can redistribute it and/or modify it under the terms of the
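L<GNU General Public License|http://www.gnu.org/licenses/gpl.html> as published by the
L<Free Software Foundation|http://www.fsf.org/>, either version 3 of the license, or (at your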
option) any later version.
Newcomen is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
L<GNU General Public License|http://www.gnu.org/licenses/gpl.html> for more details.
You should have received a copy of the
L<GNU General Public License|http://www.gnu.org/licenses/gpl.html> along with Newcomen. If not, see
<L<http://www.gnu.org/licenses/>>.
=cut
####################################################################################################
# :indentSize=3:tabSize=3:noTabs=true:mode=perl:maxLineLen=100: