summaryrefslogtreecommitdiff
path: root/cgi-bin/trawler.plx
diff options
context:
space:
mode:
Diffstat (limited to 'cgi-bin/trawler.plx')
-rw-r--r--cgi-bin/trawler.plx393
1 files changed, 393 insertions, 0 deletions
diff --git a/cgi-bin/trawler.plx b/cgi-bin/trawler.plx
new file mode 100644
index 0000000..9d755d2
--- /dev/null
+++ b/cgi-bin/trawler.plx
@@ -0,0 +1,393 @@
+#!c:/perl/bin/perl
+
+use strict;
+use LWP::UserAgent;
+require LWP;
+use URI;
+use LWP::RobotUA;
+require LWP::RobotUA;
+use Globals;
+use Globals qw(%ALLPLATTERS);
+use Globals qw($MINTRAWL);
+use Globals qw($delim);
+use Globals qw(@MUSIC);
+use Globals qw(@VIDEO);
+use Globals qw(@SHORTCUT);
+use Globals qw($debug);
+use Globals qw($bgcolour);
+
+use CGI qw(:standard);
+$|=1;
+
+my $QUERY = CGI::new();
+my $NAMELIST = $QUERY->param( "hosts" );
+my $NAME;
+my $HOST;
+my $BASE;
+my %ABCS;
+my $HEADER = new HTTP::Headers( 'Accept' => 'text/plain', 'User-Agent' => 'PlatterTwaartter/3.0.0', 'Dixon' => 'Am-Hammy-Twaart' );
+my $ROBOT_NAME = "PlatterTwaartter";
+my $ROBOT_OWNER = "chris.dixon\@hammytwaart.com";
+my $USERAGENT;
+my $RECURSE=0; # Stop weird parent links setting us up the bomb
+my $LINECOUNT = 0;
+my $QUICKEXT = $delim.join($delim, (@MUSIC, @VIDEO, @SHORTCUT)).$delim;
+# Print the header
+#
+my $TEMPLATE = openTemplate ("trawl");
+$TEMPLATE =~ s/%MINTRAWL%/$MINTRAWL/g;
+$TEMPLATE =~ s/%BGCOLOUR%/$bgcolour/g;
+print $TEMPLATE;
+
+if ( defined $NAMELIST ) {
+
+ # Lock the inventory against meddling by pesky kids
+ if (-e "../inventory/trawl.lock") {
+ print '<p>The Distroplat inventory is currently locked by another user. Please <a href="javascript:location.reload()">try again</a> in a bit. Bye.</p>';
+ exit(0);
+ }
+
+ open (LOCK, ">../inventory/trawl.lock");
+ close(LOCK);
+
+ # Load the ABCs
+
+ %ABCS = getABCs();
+
+ #Set up the user agent
+
+ $USERAGENT = new LWP::RobotUA $ROBOT_NAME, $ROBOT_OWNER;
+ # go as fast as you can
+ $USERAGENT->delay(0);
+ $USERAGENT->timeout(15); # seconds - doesn't seem to work though
+
+ # Deal with each server in turn
+ my @SPLITNAME = split(',', $NAMELIST);
+
+ print "<h2>Attempting to trawl ".scalar(@SPLITNAME)." servers</h2>\n";
+
+ foreach $NAME (@SPLITNAME) {
+
+ print "<h4>Trawling $NAME . . </h4>";
+ open (INVENTORY, ">../inventory/$NAME.inv");
+ open (ARTISTS, ">../inventory/$NAME.artist");
+ # Find the host
+ my $found=0;
+ my @RESULT;
+ foreach $HOST (keys %ALLPLATTERS) {
+ if ($HOST eq $NAME) {
+ $found = 1;
+
+ $BASE = $ALLPLATTERS{$HOST};
+
+ print "Trawling ".$NAME."'s server @ http://$BASE\n";
+ print INVENTORY "http://$BASE\n";
+ my $TIME = time;
+ @RESULT = parse_page( 'http://'.$BASE.'/', $NAME);
+
+ my $INVENTORY = shift (@RESULT);
+ if ( length $INVENTORY > 0 ) {
+ print INVENTORY $INVENTORY;
+ my $count = shift @RESULT;
+
+ # remove duplicate artists
+ @RESULT = sort @RESULT;
+ my $i=0; my $last;
+ while ($i<=$#RESULT) {
+ if ($last eq $RESULT[$i]) {
+ splice @RESULT, $i, 1;
+ }
+ else {
+ $last = $RESULT[$i];
+ $i++;
+ }
+ }
+
+ print ARTISTS join("\n", @RESULT)."\n";
+ print "<p id=\"$NAME\">Indexed $count tracks in ".(time-$TIME)." seconds.<script>document.getElementById(\"$NAME\").scrollIntoView(false);</script>";
+
+ }
+ }
+ }
+ close INVENTORY;
+ close ARTISTS;
+ unless ($found) {
+ print "<br>Host $NAME is not registered platter host.";
+ }
+ }
+
+# Re-write the abcs file
+
+open ( ABC, ">../inventory/abcs.txt" );
+my @abckeys = sort {uc($a) cmp uc($b)} keys %ABCS;
+my $key;
+foreach $key (@abckeys) {
+ print ABC $key.$delim.$ABCS{$key}."\n";
+ }
+close ABC;
+
+print "<h4 id=\"end\">All done</h4><script>document.getElementById(\"end\").scrollIntoView(false);</script></body></html>";
+unlink "../inventory/trawl.lock";
+}
+else
+{
+ # Generate form
+ #
+ print q{ <form name="twatsource">
+ <table border=0 cellpadding=1 cellspacing=0>
+ <tr>
+ <td>&nbsp;</td>
+ <td><b>User</b></td>
+ <td><b>Last Trawled</b></td>
+ </tr>
+ <tr><td colspan=3><hr size=1 color=white></td></tr>
+ };
+
+ my %TIMES;
+
+ foreach $HOST (keys %ALLPLATTERS)
+ {
+ my $HOSTFILE = '../inventory/'.$HOST.".inv";
+ my $thistime = 0;
+
+ if (-e $HOSTFILE)
+ {
+ my @stats = stat(_);
+ $thistime = $stats[9];
+ }
+
+ $TIMES{ $thistime.";".$HOST } .= $ALLPLATTERS{$HOST}."<br>";
+ }
+
+ my $TIME;
+ my $LASTMOD;
+
+ foreach $TIME (sort {$b <=> $a} keys %TIMES)
+ {
+ my ($THISTIME, $HOST) = split (/;/, $TIME);
+ my $URL = $TIMES{$TIME};
+ my $TRAWLABLE = 1;
+
+ if ($TIME > 0)
+ {
+ $LASTMOD = localtime($TIME)." CET";
+ if (time - $TIME < (60*$MINTRAWL))
+ {
+ $TRAWLABLE = 0;
+ }
+ }
+ else
+ {
+ $LASTMOD = "Not trawled yet";
+ }
+
+ print "<tr><td>";
+
+ if ($TRAWLABLE)
+ {
+ print "<input type=\"checkbox\" name=\"".$HOST."\">";
+ }
+ else
+ {
+ print "&nbsp;";
+ }
+ print "</td>";
+ print "<td nowrap valign=bottom><b>".$HOST."</b></td>";
+ print "<td nowrap valign=bottom>&nbsp;<small>".$LASTMOD."</small></td></tr>\n";
+
+}
+
+print qq{ </tr>
+ <tr><td colspan=3><hr size=1 color=white></td></tr></table>
+ </form>
+ <form><input type="button" onClick="twat()" value="Trawl"></form>
+ </body>
+ </html>
+ };
+
+}
+exit 0;
+
+
+
+# sub to recursively trawl http directories for A tags
+# should store all the recognised files it can find.
+#
+# should be called with 1 parameter
+# - path
+#
+sub parse_page
+{
+ my($URL, $NAME) = @_;
+ my $INVENTORY = "";
+ my @ARTISTS = ();
+ my $localcount = 0;
+ $RECURSE++;
+
+ if ($RECURSE < 3) {
+ my $FOLDER;
+ if ($RECURSE == 1) {
+ $FOLDER = $NAME."'s platter";
+ }
+ else {
+ $URL =~ /.*\/([^\/]+)\/*$/;
+ $FOLDER = makeNice($1);
+ }
+ print "<br><span id=\"line".$LINECOUNT."\">Indexing <b>".$FOLDER.":</b></span><script>document.getElementById(\"line".$LINECOUNT++."\").scrollIntoView(false);</script>\n";
+ }
+
+
+ #request page
+ my $REQUEST = HTTP::Request->new('GET', $URL , $HEADER);
+ my $RESPONSE = $USERAGENT->request($REQUEST);
+ #if response is successful
+ if ($RESPONSE->is_success)
+ {
+ #parse page that is returned
+ require HTML::Parse;
+ my $HTML = HTML::Parse::parse_html($RESPONSE->content);
+ my $base = $RESPONSE->base;
+ for ( @{ $HTML->extract_links } )
+ {
+ my($LINK, $elem) = @$_;
+ my $TAG = $elem->tag;
+
+ my $LINK = URI->new($LINK)->abs($base)->as_string;
+ #$LINK = makeUnNice($LINK);
+ $LINK =~ s/%25/%/g;
+ my $OUTPUT = $LINK;
+ $OUTPUT =~ s/(.*?)$BASE//;
+ my $EXT;
+ if ($OUTPUT =~ /\.(\w*)$/) { $EXT = $1;}
+
+ if ($TAG =~ /[Aa]/ && $URL !~ /\Q$LINK\E\/?.*/ && $LINK !~ /\?.*/)
+ {
+ if ($debug) { print "<br>$LINK\n" }
+ if (
+ $QUICKEXT =~ /$delim$EXT$delim/i
+ ||
+ $OUTPUT =~ /(artist|cover)[^\/]*\.(jpg|gif|png)$/i
+ ||
+ $OUTPUT =~ /.+\/info\.txt$/i
+ )
+ {
+ #if the file is recognised then
+
+ #add link to inventory
+ my @entry = makeInvLine($OUTPUT);
+
+ $INVENTORY .= join($delim, @entry)."$delim$OUTPUT\n";
+ push (@ARTISTS, $entry[0]) if $entry[0];
+ push (@ARTISTS, $entry[1]) if $entry[1];
+
+ if ($QUICKEXT =~ /$delim$EXT$delim/i) {$localcount++; }
+ }
+ elsif ($OUTPUT =~ /.abc$/i)
+ {
+ if ($OUTPUT =~ /([^\/]+)\/([^\/]+\.abc)$/)
+ {
+ my $PATH = makeNice($1); my $ARTIST = makeNice($2);
+ $ABCS{$PATH} = $ARTIST;
+ }
+ }
+ elsif ($OUTPUT =~ /^\/info\.txt$/i)
+ { # news page
+ my $NEWSREQUEST = HTTP::Request->new('GET', $LINK, $HEADER);
+ my $NEWSRESPONSE = $USERAGENT->request($NEWSREQUEST);
+ if ($NEWSRESPONSE->is_success)
+ {
+ my $lastmod = $NEWSRESPONSE->headers()->last_modified;
+ open(NEWS, ">../inventory/$NAME.news");
+ print NEWS $lastmod."\n";
+ print NEWS $NEWSRESPONSE->content;
+ close(NEWS);
+ print "<br>Fetched news page from $NAME";
+ }
+ else
+ {
+ print '<br><b style="color: red">Code '.$RESPONSE->code()." error fetching news page from $NAME: ".$RESPONSE->message().'</b>';
+ }
+ }
+ else
+ {
+ #print "<br>$LINK\n";
+ if ($LINK =~ /\Q$URL\E/i # previous link is a substring of the new one
+ && !($LINK=~ /\.\w{3}$/)) # do not retrieve anything with a dot and three chars at the end
+ {
+ if ($RECURSE < 5) # limit navigation to 5 folders deep - and that's taking the piss
+ {
+ my @CHILD = parse_page($LINK);
+ $INVENTORY.= shift(@CHILD);
+ $localcount += shift(@CHILD);
+ push(@ARTISTS, @CHILD);
+ }
+ else
+ {
+ print "<h3>Recursion limit hit, will not trawl child folders.<br>".
+ "Please check the format of your platter and contact Platter Support</h3>";
+ }
+
+ }
+ }
+ }
+ }
+ }
+ else {
+ print '<br><b style="color: red">Code '.$RESPONSE->code().' error indexing '.makeNice($URL).': '.$RESPONSE->message().'</b>'; }
+
+ if ($RECURSE == 2) {
+ print " $localcount tracks found.";
+ }
+ $RECURSE--;
+ return ($INVENTORY, $localcount, @ARTISTS);
+}
+
+sub makeInvLine( @_ ) {
+ my @items = split('/', $_[0]);
+ shift @items; # remove leading slash
+
+ # get artist folder
+ my $folder;
+ if ($#items > 0) {
+ $folder = shift @items;
+ $folder =~ s/^(The%20|)(.*?)(,%20The|)$/$2/i;
+ }
+
+ # get track
+
+ my $track = pop(@items);
+
+ # square brackets weren't seen for 6 years, but when they arrived they broke it
+ $track =~ s/\[/%5B/g;
+ $track =~ s/\]/%5D/g;
+
+
+ # get track artist
+ my $trackartist;
+ if ($track =~ /^(\d{2}%20-%20|)(The%20|)([^\/]*?)(,%20The|)%20-%20/i
+ && !($3 =~ /^\d{2}$/)
+ && length $3 > 0) {
+ $trackartist = $3;
+ $track =~ s/(The%20|)$trackartist(,%20The|)%20\-%20(.*?)/$3/;
+ }
+
+
+ # get album
+ my $album; my $year;
+ if ($#items > -1) {
+ $album = join('/', @items);
+
+ # get year
+ if ($album =~ /%20(\(|%28)(\d{4})(\)|%29)(\/|$)/) {
+ $year = $2;
+ }
+ }
+
+ return (
+ makeNice($folder),
+ makeNice($trackartist),
+ makeNice($album),
+ $year,
+ makeNice($track));
+
+}