diff options
Diffstat (limited to 'cgi-bin/trawler.plx')
| -rw-r--r-- | cgi-bin/trawler.plx | 393 |
1 files changed, 393 insertions, 0 deletions
diff --git a/cgi-bin/trawler.plx b/cgi-bin/trawler.plx new file mode 100644 index 0000000..9d755d2 --- /dev/null +++ b/cgi-bin/trawler.plx @@ -0,0 +1,393 @@ +#!c:/perl/bin/perl + +use strict; +use LWP::UserAgent; +require LWP; +use URI; +use LWP::RobotUA; +require LWP::RobotUA; +use Globals; +use Globals qw(%ALLPLATTERS); +use Globals qw($MINTRAWL); +use Globals qw($delim); +use Globals qw(@MUSIC); +use Globals qw(@VIDEO); +use Globals qw(@SHORTCUT); +use Globals qw($debug); +use Globals qw($bgcolour); + +use CGI qw(:standard); +$|=1; + +my $QUERY = CGI::new(); +my $NAMELIST = $QUERY->param( "hosts" ); +my $NAME; +my $HOST; +my $BASE; +my %ABCS; +my $HEADER = new HTTP::Headers( 'Accept' => 'text/plain', 'User-Agent' => 'PlatterTwaartter/3.0.0', 'Dixon' => 'Am-Hammy-Twaart' ); +my $ROBOT_NAME = "PlatterTwaartter"; +my $ROBOT_OWNER = "chris.dixon\@hammytwaart.com"; +my $USERAGENT; +my $RECURSE=0; # Stop weird parent links setting us up the bomb +my $LINECOUNT = 0; +my $QUICKEXT = $delim.join($delim, (@MUSIC, @VIDEO, @SHORTCUT)).$delim; +# Print the header +# +my $TEMPLATE = openTemplate ("trawl"); +$TEMPLATE =~ s/%MINTRAWL%/$MINTRAWL/g; +$TEMPLATE =~ s/%BGCOLOUR%/$bgcolour/g; +print $TEMPLATE; + +if ( defined $NAMELIST ) { + + # Lock the inventory against meddling by pesky kids + if (-e "../inventory/trawl.lock") { + print '<p>The Distroplat inventory is currently locked by another user. Please <a href="javascript:location.reload()">try again</a> in a bit. Bye.</p>'; + exit(0); + } + + open (LOCK, ">../inventory/trawl.lock"); + close(LOCK); + + # Load the ABCs + + %ABCS = getABCs(); + + #Set up the user agent + + $USERAGENT = new LWP::RobotUA $ROBOT_NAME, $ROBOT_OWNER; + # go as fast as you can + $USERAGENT->delay(0); + $USERAGENT->timeout(15); # seconds - doesn't seem to work though + + # Deal with each server in turn + my @SPLITNAME = split(',', $NAMELIST); + + print "<h2>Attempting to trawl ".scalar(@SPLITNAME)." servers</h2>\n"; + + foreach $NAME (@SPLITNAME) { + + print "<h4>Trawling $NAME . . </h4>"; + open (INVENTORY, ">../inventory/$NAME.inv"); + open (ARTISTS, ">../inventory/$NAME.artist"); + # Find the host + my $found=0; + my @RESULT; + foreach $HOST (keys %ALLPLATTERS) { + if ($HOST eq $NAME) { + $found = 1; + + $BASE = $ALLPLATTERS{$HOST}; + + print "Trawling ".$NAME."'s server @ http://$BASE\n"; + print INVENTORY "http://$BASE\n"; + my $TIME = time; + @RESULT = parse_page( 'http://'.$BASE.'/', $NAME); + + my $INVENTORY = shift (@RESULT); + if ( length $INVENTORY > 0 ) { + print INVENTORY $INVENTORY; + my $count = shift @RESULT; + + # remove duplicate artists + @RESULT = sort @RESULT; + my $i=0; my $last; + while ($i<=$#RESULT) { + if ($last eq $RESULT[$i]) { + splice @RESULT, $i, 1; + } + else { + $last = $RESULT[$i]; + $i++; + } + } + + print ARTISTS join("\n", @RESULT)."\n"; + print "<p id=\"$NAME\">Indexed $count tracks in ".(time-$TIME)." seconds.<script>document.getElementById(\"$NAME\").scrollIntoView(false);</script>"; + + } + } + } + close INVENTORY; + close ARTISTS; + unless ($found) { + print "<br>Host $NAME is not registered platter host."; + } + } + +# Re-write the abcs file + +open ( ABC, ">../inventory/abcs.txt" ); +my @abckeys = sort {uc($a) cmp uc($b)} keys %ABCS; +my $key; +foreach $key (@abckeys) { + print ABC $key.$delim.$ABCS{$key}."\n"; + } +close ABC; + +print "<h4 id=\"end\">All done</h4><script>document.getElementById(\"end\").scrollIntoView(false);</script></body></html>"; +unlink "../inventory/trawl.lock"; +} +else +{ + # Generate form + # + print q{ <form name="twatsource"> + <table border=0 cellpadding=1 cellspacing=0> + <tr> + <td> </td> + <td><b>User</b></td> + <td><b>Last Trawled</b></td> + </tr> + <tr><td colspan=3><hr size=1 color=white></td></tr> + }; + + my %TIMES; + + foreach $HOST (keys %ALLPLATTERS) + { + my $HOSTFILE = '../inventory/'.$HOST.".inv"; + my $thistime = 0; + + if (-e $HOSTFILE) + { + my @stats = stat(_); + $thistime = $stats[9]; + } + + $TIMES{ $thistime.";".$HOST } .= $ALLPLATTERS{$HOST}."<br>"; + } + + my $TIME; + my $LASTMOD; + + foreach $TIME (sort {$b <=> $a} keys %TIMES) + { + my ($THISTIME, $HOST) = split (/;/, $TIME); + my $URL = $TIMES{$TIME}; + my $TRAWLABLE = 1; + + if ($TIME > 0) + { + $LASTMOD = localtime($TIME)." CET"; + if (time - $TIME < (60*$MINTRAWL)) + { + $TRAWLABLE = 0; + } + } + else + { + $LASTMOD = "Not trawled yet"; + } + + print "<tr><td>"; + + if ($TRAWLABLE) + { + print "<input type=\"checkbox\" name=\"".$HOST."\">"; + } + else + { + print " "; + } + print "</td>"; + print "<td nowrap valign=bottom><b>".$HOST."</b></td>"; + print "<td nowrap valign=bottom> <small>".$LASTMOD."</small></td></tr>\n"; + +} + +print qq{ </tr> + <tr><td colspan=3><hr size=1 color=white></td></tr></table> + </form> + <form><input type="button" onClick="twat()" value="Trawl"></form> + </body> + </html> + }; + +} +exit 0; + + + +# sub to recursively trawl http directories for A tags +# should store all the recognised files it can find. +# +# should be called with 1 parameter +# - path +# +sub parse_page +{ + my($URL, $NAME) = @_; + my $INVENTORY = ""; + my @ARTISTS = (); + my $localcount = 0; + $RECURSE++; + + if ($RECURSE < 3) { + my $FOLDER; + if ($RECURSE == 1) { + $FOLDER = $NAME."'s platter"; + } + else { + $URL =~ /.*\/([^\/]+)\/*$/; + $FOLDER = makeNice($1); + } + print "<br><span id=\"line".$LINECOUNT."\">Indexing <b>".$FOLDER.":</b></span><script>document.getElementById(\"line".$LINECOUNT++."\").scrollIntoView(false);</script>\n"; + } + + + #request page + my $REQUEST = HTTP::Request->new('GET', $URL , $HEADER); + my $RESPONSE = $USERAGENT->request($REQUEST); + #if response is successful + if ($RESPONSE->is_success) + { + #parse page that is returned + require HTML::Parse; + my $HTML = HTML::Parse::parse_html($RESPONSE->content); + my $base = $RESPONSE->base; + for ( @{ $HTML->extract_links } ) + { + my($LINK, $elem) = @$_; + my $TAG = $elem->tag; + + my $LINK = URI->new($LINK)->abs($base)->as_string; + #$LINK = makeUnNice($LINK); + $LINK =~ s/%25/%/g; + my $OUTPUT = $LINK; + $OUTPUT =~ s/(.*?)$BASE//; + my $EXT; + if ($OUTPUT =~ /\.(\w*)$/) { $EXT = $1;} + + if ($TAG =~ /[Aa]/ && $URL !~ /\Q$LINK\E\/?.*/ && $LINK !~ /\?.*/) + { + if ($debug) { print "<br>$LINK\n" } + if ( + $QUICKEXT =~ /$delim$EXT$delim/i + || + $OUTPUT =~ /(artist|cover)[^\/]*\.(jpg|gif|png)$/i + || + $OUTPUT =~ /.+\/info\.txt$/i + ) + { + #if the file is recognised then + + #add link to inventory + my @entry = makeInvLine($OUTPUT); + + $INVENTORY .= join($delim, @entry)."$delim$OUTPUT\n"; + push (@ARTISTS, $entry[0]) if $entry[0]; + push (@ARTISTS, $entry[1]) if $entry[1]; + + if ($QUICKEXT =~ /$delim$EXT$delim/i) {$localcount++; } + } + elsif ($OUTPUT =~ /.abc$/i) + { + if ($OUTPUT =~ /([^\/]+)\/([^\/]+\.abc)$/) + { + my $PATH = makeNice($1); my $ARTIST = makeNice($2); + $ABCS{$PATH} = $ARTIST; + } + } + elsif ($OUTPUT =~ /^\/info\.txt$/i) + { # news page + my $NEWSREQUEST = HTTP::Request->new('GET', $LINK, $HEADER); + my $NEWSRESPONSE = $USERAGENT->request($NEWSREQUEST); + if ($NEWSRESPONSE->is_success) + { + my $lastmod = $NEWSRESPONSE->headers()->last_modified; + open(NEWS, ">../inventory/$NAME.news"); + print NEWS $lastmod."\n"; + print NEWS $NEWSRESPONSE->content; + close(NEWS); + print "<br>Fetched news page from $NAME"; + } + else + { + print '<br><b style="color: red">Code '.$RESPONSE->code()." error fetching news page from $NAME: ".$RESPONSE->message().'</b>'; + } + } + else + { + #print "<br>$LINK\n"; + if ($LINK =~ /\Q$URL\E/i # previous link is a substring of the new one + && !($LINK=~ /\.\w{3}$/)) # do not retrieve anything with a dot and three chars at the end + { + if ($RECURSE < 5) # limit navigation to 5 folders deep - and that's taking the piss + { + my @CHILD = parse_page($LINK); + $INVENTORY.= shift(@CHILD); + $localcount += shift(@CHILD); + push(@ARTISTS, @CHILD); + } + else + { + print "<h3>Recursion limit hit, will not trawl child folders.<br>". + "Please check the format of your platter and contact Platter Support</h3>"; + } + + } + } + } + } + } + else { + print '<br><b style="color: red">Code '.$RESPONSE->code().' error indexing '.makeNice($URL).': '.$RESPONSE->message().'</b>'; } + + if ($RECURSE == 2) { + print " $localcount tracks found."; + } + $RECURSE--; + return ($INVENTORY, $localcount, @ARTISTS); +} + +sub makeInvLine( @_ ) { + my @items = split('/', $_[0]); + shift @items; # remove leading slash + + # get artist folder + my $folder; + if ($#items > 0) { + $folder = shift @items; + $folder =~ s/^(The%20|)(.*?)(,%20The|)$/$2/i; + } + + # get track + + my $track = pop(@items); + + # square brackets weren't seen for 6 years, but when they arrived they broke it + $track =~ s/\[/%5B/g; + $track =~ s/\]/%5D/g; + + + # get track artist + my $trackartist; + if ($track =~ /^(\d{2}%20-%20|)(The%20|)([^\/]*?)(,%20The|)%20-%20/i + && !($3 =~ /^\d{2}$/) + && length $3 > 0) { + $trackartist = $3; + $track =~ s/(The%20|)$trackartist(,%20The|)%20\-%20(.*?)/$3/; + } + + + # get album + my $album; my $year; + if ($#items > -1) { + $album = join('/', @items); + + # get year + if ($album =~ /%20(\(|%28)(\d{4})(\)|%29)(\/|$)/) { + $year = $2; + } + } + + return ( + makeNice($folder), + makeNice($trackartist), + makeNice($album), + $year, + makeNice($track)); + +} |
