#!c:/perl/bin/perl use strict; use LWP::UserAgent; require LWP; use URI; use LWP::RobotUA; require LWP::RobotUA; use Globals; use Globals qw(%ALLPLATTERS); use Globals qw($MINTRAWL); use Globals qw($delim); use Globals qw(@MUSIC); use Globals qw(@VIDEO); use Globals qw(@SHORTCUT); use Globals qw($debug); use Globals qw($bgcolour); use CGI qw(:standard); $|=1; my $QUERY = CGI::new(); my $NAMELIST = $QUERY->param( "hosts" ); my $NAME; my $HOST; my $BASE; my %ABCS; my $HEADER = new HTTP::Headers( 'Accept' => 'text/plain', 'User-Agent' => 'PlatterTwaartter/3.0.0', 'Dixon' => 'Am-Hammy-Twaart' ); my $ROBOT_NAME = "PlatterTwaartter"; my $ROBOT_OWNER = "chris.dixon\@hammytwaart.com"; my $USERAGENT; my $RECURSE=0; # Stop weird parent links setting us up the bomb my $LINECOUNT = 0; my $QUICKEXT = $delim.join($delim, (@MUSIC, @VIDEO, @SHORTCUT)).$delim; # Print the header # my $TEMPLATE = openTemplate ("trawl"); $TEMPLATE =~ s/%MINTRAWL%/$MINTRAWL/g; $TEMPLATE =~ s/%BGCOLOUR%/$bgcolour/g; print $TEMPLATE; if ( defined $NAMELIST ) { # Lock the inventory against meddling by pesky kids if (-e "../inventory/trawl.lock") { print '

The Distroplat inventory is currently locked by another user. Please try again in a bit. Bye.

'; exit(0); } open (LOCK, ">../inventory/trawl.lock"); close(LOCK); # Load the ABCs %ABCS = getABCs(); #Set up the user agent $USERAGENT = new LWP::RobotUA $ROBOT_NAME, $ROBOT_OWNER; # go as fast as you can $USERAGENT->delay(0); $USERAGENT->timeout(15); # seconds - doesn't seem to work though # Deal with each server in turn my @SPLITNAME = split(',', $NAMELIST); print "

Attempting to trawl ".scalar(@SPLITNAME)." servers

\n"; foreach $NAME (@SPLITNAME) { print "

Trawling $NAME . .

"; open (INVENTORY, ">../inventory/$NAME.inv"); open (ARTISTS, ">../inventory/$NAME.artist"); # Find the host my $found=0; my @RESULT; foreach $HOST (keys %ALLPLATTERS) { if ($HOST eq $NAME) { $found = 1; $BASE = $ALLPLATTERS{$HOST}; print "Trawling ".$NAME."'s server @ http://$BASE\n"; print INVENTORY "http://$BASE\n"; my $TIME = time; @RESULT = parse_page( 'http://'.$BASE.'/', $NAME); my $INVENTORY = shift (@RESULT); if ( length $INVENTORY > 0 ) { print INVENTORY $INVENTORY; my $count = shift @RESULT; # remove duplicate artists @RESULT = sort @RESULT; my $i=0; my $last; while ($i<=$#RESULT) { if ($last eq $RESULT[$i]) { splice @RESULT, $i, 1; } else { $last = $RESULT[$i]; $i++; } } print ARTISTS join("\n", @RESULT)."\n"; print "

Indexed $count tracks in ".(time-$TIME)." seconds."; } } } close INVENTORY; close ARTISTS; unless ($found) { print "
Host $NAME is not registered platter host."; } } # Re-write the abcs file open ( ABC, ">../inventory/abcs.txt" ); my @abckeys = sort {uc($a) cmp uc($b)} keys %ABCS; my $key; foreach $key (@abckeys) { print ABC $key.$delim.$ABCS{$key}."\n"; } close ABC; print "

All done

"; unlink "../inventory/trawl.lock"; } else { # Generate form # print q{

}; my %TIMES; foreach $HOST (keys %ALLPLATTERS) { my $HOSTFILE = '../inventory/'.$HOST.".inv"; my $thistime = 0; if (-e $HOSTFILE) { my @stats = stat(_); $thistime = $stats[9]; } $TIMES{ $thistime.";".$HOST } .= $ALLPLATTERS{$HOST}."
"; } my $TIME; my $LASTMOD; foreach $TIME (sort {$b <=> $a} keys %TIMES) { my ($THISTIME, $HOST) = split (/;/, $TIME); my $URL = $TIMES{$TIME}; my $TRAWLABLE = 1; if ($TIME > 0) { $LASTMOD = localtime($TIME)." CET"; if (time - $TIME < (60*$MINTRAWL)) { $TRAWLABLE = 0; } } else { $LASTMOD = "Not trawled yet"; } print ""; print ""; print "\n"; } print qq{

	User	Last Trawled

"; if ($TRAWLABLE) { print ""; } else { print " "; } print "	".$HOST."	".$LASTMOD."

}; } exit 0; # sub to recursively trawl http directories for A tags # should store all the recognised files it can find. # # should be called with 1 parameter # - path # sub parse_page { my($URL, $NAME) = @_; my $INVENTORY = ""; my @ARTISTS = (); my $localcount = 0; $RECURSE++; if ($RECURSE < 3) { my $FOLDER; if ($RECURSE == 1) { $FOLDER = $NAME."'s platter"; } else { $URL =~ /.*\/([^\/]+)\/*$/; $FOLDER = makeNice($1); } print "
Indexing ".$FOLDER.":\n"; } #request page my $REQUEST = HTTP::Request->new('GET', $URL , $HEADER); my $RESPONSE = $USERAGENT->request($REQUEST); #if response is successful if ($RESPONSE->is_success) { #parse page that is returned require HTML::Parse; my $HTML = HTML::Parse::parse_html($RESPONSE->content); my $base = $RESPONSE->base; for ( @{ $HTML->extract_links } ) { my($LINK, $elem) = @$_; my $TAG = $elem->tag; my $LINK = URI->new($LINK)->abs($base)->as_string; #$LINK = makeUnNice($LINK); $LINK =~ s/%25/%/g; my $OUTPUT = $LINK; $OUTPUT =~ s/(.*?)$BASE//; my $EXT; if ($OUTPUT =~ /\.(\w*)$/) { $EXT = $1;} if ($TAG =~ /[Aa]/ && $URL !~ /\Q$LINK\E\/?.*/ && $LINK !~ /\?.*/) { if ($debug) { print "
$LINK\n" } if ( $QUICKEXT =~ /$delim$EXT$delim/i || $OUTPUT =~ /(artist|cover)[^\/]*\.(jpg|gif|png)$/i || $OUTPUT =~ /.+\/info\.txt$/i ) { #if the file is recognised then #add link to inventory my @entry = makeInvLine($OUTPUT); $INVENTORY .= join($delim, @entry)."$delim$OUTPUT\n"; push (@ARTISTS, $entry[0]) if $entry[0]; push (@ARTISTS, $entry[1]) if $entry[1]; if ($QUICKEXT =~ /$delim$EXT$delim/i) {$localcount++; } } elsif ($OUTPUT =~ /.abc$/i) { if ($OUTPUT =~ /([^\/]+)\/([^\/]+\.abc)$/) { my $PATH = makeNice($1); my $ARTIST = makeNice($2); $ABCS{$PATH} = $ARTIST; } } elsif ($OUTPUT =~ /^\/info\.txt$/i) { # news page my $NEWSREQUEST = HTTP::Request->new('GET', $LINK, $HEADER); my $NEWSRESPONSE = $USERAGENT->request($NEWSREQUEST); if ($NEWSRESPONSE->is_success) { my $lastmod = $NEWSRESPONSE->headers()->last_modified; open(NEWS, ">../inventory/$NAME.news"); print NEWS $lastmod."\n"; print NEWS $NEWSRESPONSE->content; close(NEWS); print "
Fetched news page from $NAME"; } else { print '
Code '.$RESPONSE->code()." error fetching news page from $NAME: ".$RESPONSE->message().''; } } else { #print "
$LINK\n"; if ($LINK =~ /\Q$URL\E/i # previous link is a substring of the new one && !($LINK=~ /\.\w{3}$/)) # do not retrieve anything with a dot and three chars at the end { if ($RECURSE < 5) # limit navigation to 5 folders deep - and that's taking the piss { my @CHILD = parse_page($LINK); $INVENTORY.= shift(@CHILD); $localcount += shift(@CHILD); push(@ARTISTS, @CHILD); } else { print "

Recursion limit hit, will not trawl child folders.
". "Please check the format of your platter and contact Platter Support

"; } } } } } } else { print '
Code '.$RESPONSE->code().' error indexing '.makeNice($URL).': '.$RESPONSE->message().''; } if ($RECURSE == 2) { print " $localcount tracks found."; } $RECURSE--; return ($INVENTORY, $localcount, @ARTISTS); } sub makeInvLine( @_ ) { my @items = split('/', $_[0]); shift @items; # remove leading slash # get artist folder my $folder; if ($#items > 0) { $folder = shift @items; $folder =~ s/^(The%20|)(.*?)(,%20The|)$/$2/i; } # get track my $track = pop(@items); # square brackets weren't seen for 6 years, but when they arrived they broke it $track =~ s/\[/%5B/g; $track =~ s/\]/%5D/g; # get track artist my $trackartist; if ($track =~ /^(\d{2}%20-%20|)(The%20|)([^\/]*?)(,%20The|)%20-%20/i && !($3 =~ /^\d{2}$/) && length $3 > 0) { $trackartist = $3; $track =~ s/(The%20|)$trackartist(,%20The|)%20\-%20(.*?)/$3/; } # get album my $album; my $year; if ($#items > -1) { $album = join('/', @items); # get year if ($album =~ /%20($|%28)(\d{4})($|%29)(\/|$)/) { $year = $2; } } return ( makeNice($folder), makeNice($trackartist), makeNice($album), $year, makeNice($track)); }

Attempting to trawl ".scalar(@SPLITNAME)." servers

Trawling $NAME . .

All done

Recursion limit hit, will not trawl child folders.". "Please check the format of your platter and contact Platter Support

Recursion limit hit, will not trawl child folders.
". "Please check the format of your platter and contact Platter Support