#!c:/perl/bin/perl use strict; use LWP::UserAgent; require LWP; use URI; use LWP::RobotUA; require LWP::RobotUA; use Globals; use Globals qw(%ALLPLATTERS); use Globals qw($MINTRAWL); use Globals qw($delim); use Globals qw(@MUSIC); use Globals qw(@VIDEO); use Globals qw(@SHORTCUT); use Globals qw($debug); use Globals qw($bgcolour); use CGI qw(:standard); $|=1; my $QUERY = CGI::new(); my $NAMELIST = $QUERY->param( "hosts" ); my $NAME; my $HOST; my $BASE; my %ABCS; my $HEADER = new HTTP::Headers( 'Accept' => 'text/plain', 'User-Agent' => 'PlatterTwaartter/3.0.0', 'Dixon' => 'Am-Hammy-Twaart' ); my $ROBOT_NAME = "PlatterTwaartter"; my $ROBOT_OWNER = "chris.dixon\@hammytwaart.com"; my $USERAGENT; my $RECURSE=0; # Stop weird parent links setting us up the bomb my $LINECOUNT = 0; my $QUICKEXT = $delim.join($delim, (@MUSIC, @VIDEO, @SHORTCUT)).$delim; # Print the header # my $TEMPLATE = openTemplate ("trawl"); $TEMPLATE =~ s/%MINTRAWL%/$MINTRAWL/g; $TEMPLATE =~ s/%BGCOLOUR%/$bgcolour/g; print $TEMPLATE; if ( defined $NAMELIST ) { # Lock the inventory against meddling by pesky kids if (-e "../inventory/trawl.lock") { print '

The Distroplat inventory is currently locked by another user. Please try again in a bit. Bye.

'; exit(0); } open (LOCK, ">../inventory/trawl.lock"); close(LOCK); # Load the ABCs %ABCS = getABCs(); #Set up the user agent $USERAGENT = new LWP::RobotUA $ROBOT_NAME, $ROBOT_OWNER; # go as fast as you can $USERAGENT->delay(0); $USERAGENT->timeout(15); # seconds - doesn't seem to work though # Deal with each server in turn my @SPLITNAME = split(',', $NAMELIST); print "

Attempting to trawl ".scalar(@SPLITNAME)." servers

\n"; foreach $NAME (@SPLITNAME) { print "

Trawling $NAME . .

"; open (INVENTORY, ">../inventory/$NAME.inv"); open (ARTISTS, ">../inventory/$NAME.artist"); # Find the host my $found=0; my @RESULT; foreach $HOST (keys %ALLPLATTERS) { if ($HOST eq $NAME) { $found = 1; $BASE = $ALLPLATTERS{$HOST}; print "Trawling ".$NAME."'s server @ http://$BASE\n"; print INVENTORY "http://$BASE\n"; my $TIME = time; @RESULT = parse_page( 'http://'.$BASE.'/', $NAME); my $INVENTORY = shift (@RESULT); if ( length $INVENTORY > 0 ) { print INVENTORY $INVENTORY; my $count = shift @RESULT; # remove duplicate artists @RESULT = sort @RESULT; my $i=0; my $last; while ($i<=$#RESULT) { if ($last eq $RESULT[$i]) { splice @RESULT, $i, 1; } else { $last = $RESULT[$i]; $i++; } } print ARTISTS join("\n", @RESULT)."\n"; print "

Indexed $count tracks in ".(time-$TIME)." seconds."; } } } close INVENTORY; close ARTISTS; unless ($found) { print "
Host $NAME is not registered platter host."; } } # Re-write the abcs file open ( ABC, ">../inventory/abcs.txt" ); my @abckeys = sort {uc($a) cmp uc($b)} keys %ABCS; my $key; foreach $key (@abckeys) { print ABC $key.$delim.$ABCS{$key}."\n"; } close ABC; print "

All done

"; unlink "../inventory/trawl.lock"; } else { # Generate form # print q{
}; my %TIMES; foreach $HOST (keys %ALLPLATTERS) { my $HOSTFILE = '../inventory/'.$HOST.".inv"; my $thistime = 0; if (-e $HOSTFILE) { my @stats = stat(_); $thistime = $stats[9]; } $TIMES{ $thistime.";".$HOST } .= $ALLPLATTERS{$HOST}."
"; } my $TIME; my $LASTMOD; foreach $TIME (sort {$b <=> $a} keys %TIMES) { my ($THISTIME, $HOST) = split (/;/, $TIME); my $URL = $TIMES{$TIME}; my $TRAWLABLE = 1; if ($TIME > 0) { $LASTMOD = localtime($TIME)." CET"; if (time - $TIME < (60*$MINTRAWL)) { $TRAWLABLE = 0; } } else { $LASTMOD = "Not trawled yet"; } print ""; print ""; print "\n"; } print qq{
  User Last Trawled

"; if ($TRAWLABLE) { print ""; } else { print " "; } print "".$HOST." ".$LASTMOD."

}; } exit 0; # sub to recursively trawl http directories for A tags # should store all the recognised files it can find. # # should be called with 1 parameter # - path # sub parse_page { my($URL, $NAME) = @_; my $INVENTORY = ""; my @ARTISTS = (); my $localcount = 0; $RECURSE++; if ($RECURSE < 3) { my $FOLDER; if ($RECURSE == 1) { $FOLDER = $NAME."'s platter"; } else { $URL =~ /.*\/([^\/]+)\/*$/; $FOLDER = makeNice($1); } print "
Indexing ".$FOLDER.":\n"; } #request page my $REQUEST = HTTP::Request->new('GET', $URL , $HEADER); my $RESPONSE = $USERAGENT->request($REQUEST); #if response is successful if ($RESPONSE->is_success) { #parse page that is returned require HTML::Parse; my $HTML = HTML::Parse::parse_html($RESPONSE->content); my $base = $RESPONSE->base; for ( @{ $HTML->extract_links } ) { my($LINK, $elem) = @$_; my $TAG = $elem->tag; my $LINK = URI->new($LINK)->abs($base)->as_string; #$LINK = makeUnNice($LINK); $LINK =~ s/%25/%/g; my $OUTPUT = $LINK; $OUTPUT =~ s/(.*?)$BASE//; my $EXT; if ($OUTPUT =~ /\.(\w*)$/) { $EXT = $1;} if ($TAG =~ /[Aa]/ && $URL !~ /\Q$LINK\E\/?.*/ && $LINK !~ /\?.*/) { if ($debug) { print "
$LINK\n" } if ( $QUICKEXT =~ /$delim$EXT$delim/i || $OUTPUT =~ /(artist|cover)[^\/]*\.(jpg|gif|png)$/i || $OUTPUT =~ /.+\/info\.txt$/i ) { #if the file is recognised then #add link to inventory my @entry = makeInvLine($OUTPUT); $INVENTORY .= join($delim, @entry)."$delim$OUTPUT\n"; push (@ARTISTS, $entry[0]) if $entry[0]; push (@ARTISTS, $entry[1]) if $entry[1]; if ($QUICKEXT =~ /$delim$EXT$delim/i) {$localcount++; } } elsif ($OUTPUT =~ /.abc$/i) { if ($OUTPUT =~ /([^\/]+)\/([^\/]+\.abc)$/) { my $PATH = makeNice($1); my $ARTIST = makeNice($2); $ABCS{$PATH} = $ARTIST; } } elsif ($OUTPUT =~ /^\/info\.txt$/i) { # news page my $NEWSREQUEST = HTTP::Request->new('GET', $LINK, $HEADER); my $NEWSRESPONSE = $USERAGENT->request($NEWSREQUEST); if ($NEWSRESPONSE->is_success) { my $lastmod = $NEWSRESPONSE->headers()->last_modified; open(NEWS, ">../inventory/$NAME.news"); print NEWS $lastmod."\n"; print NEWS $NEWSRESPONSE->content; close(NEWS); print "
Fetched news page from $NAME"; } else { print '
Code '.$RESPONSE->code()." error fetching news page from $NAME: ".$RESPONSE->message().''; } } else { #print "
$LINK\n"; if ($LINK =~ /\Q$URL\E/i # previous link is a substring of the new one && !($LINK=~ /\.\w{3}$/)) # do not retrieve anything with a dot and three chars at the end { if ($RECURSE < 5) # limit navigation to 5 folders deep - and that's taking the piss { my @CHILD = parse_page($LINK); $INVENTORY.= shift(@CHILD); $localcount += shift(@CHILD); push(@ARTISTS, @CHILD); } else { print "

Recursion limit hit, will not trawl child folders.
". "Please check the format of your platter and contact Platter Support

"; } } } } } } else { print '
Code '.$RESPONSE->code().' error indexing '.makeNice($URL).': '.$RESPONSE->message().''; } if ($RECURSE == 2) { print " $localcount tracks found."; } $RECURSE--; return ($INVENTORY, $localcount, @ARTISTS); } sub makeInvLine( @_ ) { my @items = split('/', $_[0]); shift @items; # remove leading slash # get artist folder my $folder; if ($#items > 0) { $folder = shift @items; $folder =~ s/^(The%20|)(.*?)(,%20The|)$/$2/i; } # get track my $track = pop(@items); # square brackets weren't seen for 6 years, but when they arrived they broke it $track =~ s/\[/%5B/g; $track =~ s/\]/%5D/g; # get track artist my $trackartist; if ($track =~ /^(\d{2}%20-%20|)(The%20|)([^\/]*?)(,%20The|)%20-%20/i && !($3 =~ /^\d{2}$/) && length $3 > 0) { $trackartist = $3; $track =~ s/(The%20|)$trackartist(,%20The|)%20\-%20(.*?)/$3/; } # get album my $album; my $year; if ($#items > -1) { $album = join('/', @items); # get year if ($album =~ /%20(\(|%28)(\d{4})(\)|%29)(\/|$)/) { $year = $2; } } return ( makeNice($folder), makeNice($trackartist), makeNice($album), $year, makeNice($track)); }