1 files changed, 393 insertions, 0 deletions
diff --git a/cgi-bin/trawler.plx b/cgi-bin/trawler.plx
new file mode 100644
index 0000000..9d755d2
--- /dev/null
+++ b/cgi-bin/trawler.plx
@@ -0,0 +1,393 @@
+#!c:/perl/bin/perl
+
+use strict;
+use LWP::UserAgent;
+require LWP;
+use URI;
+use LWP::RobotUA;
+require LWP::RobotUA;
+use Globals;
+use Globals qw(%ALLPLATTERS);
+use Globals qw($MINTRAWL);
+use Globals qw($delim);
+use Globals qw(@MUSIC);
+use Globals qw(@VIDEO);
+use Globals qw(@SHORTCUT);
+use Globals qw($debug);
+use Globals qw($bgcolour);
+
+use CGI qw(:standard);
+$|=1;
+
+my $QUERY     = CGI::new();
+my $NAMELIST    = $QUERY->param( "hosts" );
+my $NAME;
+my $HOST;
+my $BASE;
+my %ABCS;
+my $HEADER = new HTTP::Headers( 'Accept' => 'text/plain', 'User-Agent' => 'PlatterTwaartter/3.0.0', 'Dixon' => 'Am-Hammy-Twaart' );
+my $ROBOT_NAME = "PlatterTwaartter";
+my $ROBOT_OWNER = "chris.dixon\@hammytwaart.com";
+my $USERAGENT;
+my $RECURSE=0; # Stop weird parent links setting us up the bomb
+my $LINECOUNT = 0;
+my $QUICKEXT = $delim.join($delim, (@MUSIC, @VIDEO, @SHORTCUT)).$delim;
+# Print the header
+#
+my $TEMPLATE = openTemplate ("trawl");
+$TEMPLATE =~ s/%MINTRAWL%/$MINTRAWL/g;
+$TEMPLATE =~ s/%BGCOLOUR%/$bgcolour/g;
+print $TEMPLATE;
+
+if ( defined $NAMELIST ) {
+
+  # Lock the inventory against meddling by pesky kids
+  if (-e "../inventory/trawl.lock") {
+    print '<p>The Distroplat inventory is currently locked by another user. Please <a href="javascript:location.reload()">try again</a> in a bit. Bye.</p>';
+    exit(0);
+    }
+
+  open (LOCK, ">../inventory/trawl.lock");
+  close(LOCK);
+
+  # Load the ABCs
+
+  %ABCS = getABCs();
+
+  #Set up the user agent
+
+  $USERAGENT = new LWP::RobotUA $ROBOT_NAME, $ROBOT_OWNER;
+  # go as fast as you can
+  $USERAGENT->delay(0);
+  $USERAGENT->timeout(15); # seconds - doesn't seem to work though
+
+  # Deal with each server in turn
+  my @SPLITNAME = split(',', $NAMELIST);
+
+  print "<h2>Attempting to trawl ".scalar(@SPLITNAME)." servers</h2>\n";
+
+  foreach $NAME (@SPLITNAME) {
+
+    print "<h4>Trawling $NAME  . . </h4>";
+    open (INVENTORY, ">../inventory/$NAME.inv");
+    open (ARTISTS, ">../inventory/$NAME.artist");
+    # Find the host
+    my $found=0;
+    my @RESULT;
+    foreach $HOST (keys %ALLPLATTERS) {
+      if ($HOST eq $NAME) {
+        $found = 1;
+
+        $BASE = $ALLPLATTERS{$HOST};
+
+        print "Trawling ".$NAME."'s server @ http://$BASE\n";
+        print INVENTORY "http://$BASE\n";
+        my $TIME = time;
+        @RESULT = parse_page( 'http://'.$BASE.'/', $NAME);
+
+        my $INVENTORY = shift (@RESULT);
+        if ( length $INVENTORY > 0 ) {
+          print INVENTORY $INVENTORY;
+          my $count = shift @RESULT;
+
+          # remove duplicate artists
+          @RESULT = sort @RESULT;
+          my $i=0; my $last;
+          while ($i<=$#RESULT) {
+              if ($last eq $RESULT[$i]) {
+                  splice @RESULT, $i, 1;
+              }
+              else {
+                  $last = $RESULT[$i];
+                  $i++;
+              }
+          }
+
+          print ARTISTS join("\n", @RESULT)."\n";
+          print "<p id=\"$NAME\">Indexed $count tracks in ".(time-$TIME)." seconds.<script>document.getElementById(\"$NAME\").scrollIntoView(false);</script>";
+
+        }
+      }
+    }
+    close INVENTORY;
+    close ARTISTS;
+    unless ($found) {
+      print "<br>Host $NAME is not registered platter host.";
+    }
+  }
+
+# Re-write the abcs file
+
+open ( ABC, ">../inventory/abcs.txt" );
+my @abckeys = sort {uc($a) cmp uc($b)} keys %ABCS;
+my $key;
+foreach $key (@abckeys) {
+    print ABC $key.$delim.$ABCS{$key}."\n";
+    }
+close ABC;
+
+print "<h4 id=\"end\">All done</h4><script>document.getElementById(\"end\").scrollIntoView(false);</script></body></html>";
+unlink "../inventory/trawl.lock";
+}
+else
+{
+  # Generate form
+  #
+  print q{ <form name="twatsource">
+          <table border=0 cellpadding=1 cellspacing=0>
+            <tr>
+              <td>&nbsp;</td>
+              <td><b>User</b></td>
+              <td><b>Last Trawled</b></td>
+            </tr>
+            <tr><td colspan=3><hr size=1 color=white></td></tr>
+          };
+
+  my %TIMES;
+
+  foreach $HOST (keys %ALLPLATTERS)
+  {
+    my $HOSTFILE = '../inventory/'.$HOST.".inv";
+    my $thistime = 0;
+
+    if (-e $HOSTFILE)
+    {
+      my @stats = stat(_);
+      $thistime = $stats[9];
+    }
+
+    $TIMES{ $thistime.";".$HOST } .= $ALLPLATTERS{$HOST}."<br>";
+  }
+
+  my $TIME;
+  my $LASTMOD;
+
+  foreach $TIME (sort {$b <=> $a} keys %TIMES)
+  {
+    my ($THISTIME, $HOST) = split (/;/, $TIME);
+    my $URL = $TIMES{$TIME};
+    my $TRAWLABLE = 1;
+
+    if ($TIME > 0)
+    {
+      $LASTMOD = localtime($TIME)." CET";
+      if (time - $TIME < (60*$MINTRAWL))
+      {
+        $TRAWLABLE = 0;
+      }
+    }
+    else
+    {
+      $LASTMOD = "Not trawled yet";
+    }
+
+    print "<tr><td>";
+
+    if ($TRAWLABLE)
+    {
+      print "<input type=\"checkbox\" name=\"".$HOST."\">";
+    }
+    else
+    {
+      print "&nbsp;";
+    }
+  print "</td>";
+  print "<td nowrap valign=bottom><b>".$HOST."</b></td>";
+  print "<td nowrap valign=bottom>&nbsp;<small>".$LASTMOD."</small></td></tr>\n";
+
+}
+
+print qq{  </tr>
+           <tr><td colspan=3><hr size=1 color=white></td></tr></table>
+         </form>
+         <form><input type="button" onClick="twat()" value="Trawl"></form>
+      </body>
+      </html>
+      };
+
+}
+exit 0;
+
+
+
+# sub to recursively trawl http directories for A tags
+# should store all the recognised files it can find.
+#
+# should be called with 1 parameter
+#  - path
+#
+sub parse_page
+{
+     my($URL, $NAME) = @_;
+     my $INVENTORY = "";
+     my @ARTISTS = ();
+     my $localcount = 0;
+     $RECURSE++;
+
+     if ($RECURSE < 3) {
+        my $FOLDER;
+        if ($RECURSE == 1) {
+             $FOLDER = $NAME."'s platter";
+         }
+         else {
+             $URL =~ /.*\/([^\/]+)\/*$/;
+             $FOLDER = makeNice($1);
+         }
+         print "<br><span id=\"line".$LINECOUNT."\">Indexing <b>".$FOLDER.":</b></span><script>document.getElementById(\"line".$LINECOUNT++."\").scrollIntoView(false);</script>\n";
+     }
+
+
+     #request page
+     my $REQUEST = HTTP::Request->new('GET', $URL , $HEADER);
+     my $RESPONSE = $USERAGENT->request($REQUEST);
+     #if response is successful
+     if ($RESPONSE->is_success)
+     {
+          #parse page that is returned
+      require HTML::Parse;
+      my $HTML = HTML::Parse::parse_html($RESPONSE->content);
+      my $base = $RESPONSE->base;
+      for ( @{ $HTML->extract_links } )
+      {
+           my($LINK, $elem) = @$_;
+           my $TAG = $elem->tag;
+
+           my $LINK = URI->new($LINK)->abs($base)->as_string;
+           #$LINK = makeUnNice($LINK);
+           $LINK =~ s/%25/%/g;
+           my $OUTPUT = $LINK;
+           $OUTPUT =~ s/(.*?)$BASE//;
+           my $EXT;
+           if ($OUTPUT =~ /\.(\w*)$/) { $EXT = $1;}
+
+           if ($TAG =~ /[Aa]/ && $URL !~ /\Q$LINK\E\/?.*/ && $LINK !~ /\?.*/)
+           {
+          		if ($debug) { print "<br>$LINK\n" }
+                if (
+                     $QUICKEXT =~ /$delim$EXT$delim/i
+                   ||
+                     $OUTPUT =~ /(artist|cover)[^\/]*\.(jpg|gif|png)$/i
+                   ||
+                     $OUTPUT =~ /.+\/info\.txt$/i
+                   )
+                {
+                     #if the file is recognised then
+
+                     #add link to inventory
+                     my @entry = makeInvLine($OUTPUT);
+
+                     $INVENTORY .= join($delim, @entry)."$delim$OUTPUT\n";
+                     push (@ARTISTS, $entry[0]) if $entry[0];
+                     push (@ARTISTS, $entry[1]) if $entry[1];
+
+                     if ($QUICKEXT =~ /$delim$EXT$delim/i) {$localcount++; }
+                }
+                elsif ($OUTPUT =~ /.abc$/i)
+                {
+                     if ($OUTPUT =~ /([^\/]+)\/([^\/]+\.abc)$/)
+                     {
+                       my $PATH = makeNice($1); my $ARTIST = makeNice($2);
+                       $ABCS{$PATH} = $ARTIST;
+                     }
+                }
+                elsif ($OUTPUT =~ /^\/info\.txt$/i)
+                {    # news page
+                    my $NEWSREQUEST = HTTP::Request->new('GET', $LINK, $HEADER);
+                    my $NEWSRESPONSE = $USERAGENT->request($NEWSREQUEST);
+                    if ($NEWSRESPONSE->is_success)
+                    {
+                        my $lastmod = $NEWSRESPONSE->headers()->last_modified;
+                        open(NEWS, ">../inventory/$NAME.news");
+                        print NEWS $lastmod."\n";
+                        print NEWS $NEWSRESPONSE->content;
+                        close(NEWS);
+                        print "<br>Fetched news page from $NAME";
+                    }
+                    else
+                    {
+                        print '<br><b  style="color: red">Code '.$RESPONSE->code()." error  fetching news page from $NAME: ".$RESPONSE->message().'</b>';
+                    }
+                }
+                else
+                {
+                  #print "<br>$LINK\n";
+                  if ($LINK =~ /\Q$URL\E/i # previous link is a substring of the new one
+                  && !($LINK=~ /\.\w{3}$/)) # do not retrieve anything with a dot and three chars at the end
+                  {
+           if ($RECURSE < 5) # limit navigation to 5 folders deep - and that's taking the piss
+             {
+               my @CHILD = parse_page($LINK);
+               $INVENTORY.= shift(@CHILD);
+               $localcount += shift(@CHILD);
+               push(@ARTISTS, @CHILD);
+             }
+           else
+             {
+               print "<h3>Recursion limit hit, will not trawl child folders.<br>".
+               "Please check the format of your platter and contact Platter Support</h3>";
+             }
+
+                  }
+                }
+           }
+          }
+     }
+     else {
+        print '<br><b  style="color: red">Code '.$RESPONSE->code().' error indexing '.makeNice($URL).': '.$RESPONSE->message().'</b>';     }
+
+     if ($RECURSE == 2) {
+        print " $localcount tracks found.";
+        }
+     $RECURSE--;
+     return ($INVENTORY, $localcount, @ARTISTS);
+}
+
+sub makeInvLine( @_ ) {
+    my @items = split('/', $_[0]);
+    shift @items; # remove leading slash
+
+    # get artist folder
+    my $folder;
+    if ($#items > 0) {
+        $folder = shift @items;
+        $folder =~ s/^(The%20|)(.*?)(,%20The|)$/$2/i;
+    }
+
+    # get track
+
+    my $track = pop(@items);
+    
+    # square brackets weren't seen for 6 years, but when they arrived they broke it
+    $track =~ s/\[/%5B/g;
+    $track =~ s/\]/%5D/g;
+
+
+    # get track artist
+    my $trackartist;
+    if ($track =~ /^(\d{2}%20-%20|)(The%20|)([^\/]*?)(,%20The|)%20-%20/i
+        && !($3 =~ /^\d{2}$/)
+        && length $3 > 0) {
+        $trackartist = $3;
+        $track =~ s/(The%20|)$trackartist(,%20The|)%20\-%20(.*?)/$3/;
+    }
+
+
+    # get album
+    my $album; my $year;
+    if ($#items > -1) {
+        $album = join('/', @items);
+
+        # get year
+        if ($album =~ /%20(\(|%28)(\d{4})(\)|%29)(\/|$)/) {
+            $year = $2;
+        }
+    }
+
+    return (
+        makeNice($folder),
+        makeNice($trackartist),
+        makeNice($album),
+        $year,
+        makeNice($track));
+
+}