#!/bin/perl
# Script by Rich Waters
# Last updated: 6/3/05
# Purpose: to scan for and download an entire archive of penny-arcade comics.  Also has an update function to grab only the most recent comics
use Tie::File;
use LWP::RobotUA;
use LWP::Simple;
use Getopt::Long;
use strict;
use warnings;

# Script-wide settings.  The all-caps flags are switches edited in the source;
# only $LOGERROR and $FILENAME are changed by the command-line options below.
my $VERSION = 0.2;
my $DEBUGPIC = 0;            # 1 = print the image path for each comic instead of downloading
my $GETDATES = 1;            # 1 = run the calendar-scraping section
my $LOGERROR = 0;            # 1 = also record failed downloads in Error.txt (set by -l)
my $DEBUGDATA = 0;           # 1 = dump the scraped date/title pairs while matching them
my $DEBUGCOUNT = 0;          # 1 = print per-year match counts and pause before downloading
my $FILENAME = "dates.txt";  # file holding the "year-~month-~day-~title" records

my %command = ();
my (@file, @ERROR);

# Accept both "-x" and "/x" style switches.
Getopt::Long::Configure("prefix_pattern=(-|\/)");
my $options_ok = GetOptions(	\%command,qw(begindate|b=i enddate|e=i overwrite|o update|u verbose|v logerror|l help|?|h));
if (!$options_ok) {
	# GetOptions has already printed what was wrong; show the usage and stop
	# instead of silently running with defaults.
	syntax();
	exit 1;
}

if (exists $command{help}) {
	syntax();
	exit;
}

my $current_year = (localtime(time))[5] + 1900;
if (!exists $command{begindate}) {
	$command{begindate} = 1998;
}
if (!exists $command{enddate}) {
	$command{enddate} = $current_year;
}
if (exists $command{logerror}) {
	$LOGERROR = 1;
}
if (exists $command{update}) {
	# Update mode: only look at the current year and always rebuild its
	# listing, which is kept in a separate file from the full archive listing.
	$command{begindate} = $current_year;
	$command{enddate} = $current_year;
	$command{overwrite} = 1;
	$FILENAME = "update.txt";
}

# Tie the error log only now that -l has been processed; tying it before
# option parsing meant the switch could never turn logging on.
if ($LOGERROR) {
	if (!tie @ERROR, 'Tie::File', 'Error.txt') {
		warn "Could not open Error.txt for error logging: $!\n";
		$LOGERROR = 0;
	}
}

my $ua = LWP::RobotUA->new('NefApp/0.1', 'nef@lar.com');
$ua->delay(5/60); # Be semi - nice: delay() is given in minutes, so this is 5 seconds
my $skip = 0;

# Build the listing of comic dates and titles from the yearly calendar pages,
# unless the listing file already exists and overwrite was not requested.
#
# Each calendar page is scanned for two things: "<!-- month/day -->" comment
# markers, and anchor tags linking to view.php3 that carry the date and the
# comic title.  Every marker is paired with the first anchor for the same
# month/day and stored in the tied file as "year-~month-~day-~title".
# Could probably tone down this loop a bit now, since it double checks the two
# places on the page where the dates are listed.  Really only need to check
# the anchor tags... possibly in a future release.
if ($GETDATES) {
	my $verbose = exists $command{verbose};
	if (-e $FILENAME) {
		if (exists $command{overwrite}) {
			if ($verbose) { print "deleting $FILENAME\n"; }
			unlink($FILENAME) or warn "Could not delete $FILENAME: $!\n";
		} else {
			# Reuse the existing listing; nothing is fetched from the calendar.
			$skip = 1;
		}
	}
	tie @file, 'Tie::File', $FILENAME or die "Could not open $FILENAME: $!\n";
	if (!$skip) {
		if ($verbose) { print "opened $FILENAME for writing\n"; }
		print "$command{begindate} - $command{enddate}\n";
		for my $year ($command{begindate} .. $command{enddate}) {
			if ($verbose) { print "Reading $year\n"; }
			my $innercount = 0;   # anchors found
			my $outercount = 0;   # comment markers found
			my @markers = ();     # [month, day] from the comment markers
			my @anchors = ();     # [month, day, title] from the anchor tags
			#Calendar contains a complete listing of all the dates comics were released
			my $res = $ua->get("http://www.penny-arcade.com/calendar.php?year=$year");
			if ($res->is_success) {
				LINE: foreach my $line (split($/,$res->content)) {
					if ($line =~ /<!-- (\d+)\/(\d+) -->/) {
						# Site messed up, there is no 9/31/04 comic: drop the whole line.
						next LINE if ($year == 2004 && $1 == 9 && $2 == 31);
						$outercount++;
						push(@markers, [$1, $2]);
					}
					foreach my $chunk (split("<A",$line)) {
						# $1 is the year (not needed here), $2 month, $3 day, $4 title
						if ($chunk =~ /HREF=\"view.php3\?date=(\d+)-(\d+)-(\d+)&res=l\" title=\"([A-Za-z0-9_ ,\.'`’?!:\?\(\)\[\]=\>\<\/\"\$\+&#©-]+)\" onmouse/) {
							push(@anchors, [$2, $3, $4]);
							$innercount++;
						}
					}
				}
				foreach my $marker (@markers) {
					my ($month, $day) = @$marker;
					if ($DEBUGDATA) {
						# local keeps the tab separator from leaking into later prints
						local $, = "\t";
						print "************OUTER*********** $year\n";
						print $month, $day, "\n";
					}
					my $found = 0;
					foreach my $anchor (@anchors) {
						my ($a_month, $a_day, $title) = @$anchor;
						# numeric comparison, so "09" and "9" are the same month/day
						next unless ($month == $a_month && $day == $a_day);
						$found = 1;
						push(@file, "$year-~$month-~$day-~$title");
						if ($DEBUGDATA) {
							local $, = "\t";
							print "INN:", $a_month, $a_day, $title, "\n";
						}
						last;
					}
					if (!$found) { print "$month-$day - ERROR\n"; }
				}
			} else {
				print $res->status_line, "\n";
			}
			if ($DEBUGCOUNT) { print "$year - inner=$innercount outer=$outercount\n"; }
		}
	}
}
if ($DEBUGCOUNT) { <>; } # debug pause: wait for a line of input before downloading
print "Listings Read from Web Site\n";

# Download every comic listed in @file.  Each record has the form
# "year-~month-~day-~title"; the image is saved as
# "<year>/<year>-<mm>-<dd>-<title>.<ext>".
my $verbose = exists $command{verbose};
RECORD: foreach my $record (@file) {
	# Limit of 4 keeps the title in one piece.
	my @data = split(/-~/, $record, 4);
	if (@data != 4) {
		warn "Skipping malformed record: $record\n";
		next RECORD;
	}
	my ($year, $month, $day, $title) = @data;

	# generate the appropriate filename based off the date
	# Two dates are special-cased to use the "l" image instead of the usual
	# "h" one (PA screwups: the high res pic is messed up for them).
	my $use_low_res = ($year == 2003 && $month == 11 && $day == 12)
		|| ($year == 2002 && $month == 1 && $day == 6);
	my $picBase = sprintf("%s/%s%02s%02s%s",$year,$year,$month,$day,($use_low_res ? "l" : "h"));

	$title =~ s/[<>\\\/\*\?\"\|:]/~/g; # Substitute < > \ / * ? " | : with ~ to make a workable filename
	$title =~ s/(\s\s+)/ /g; # Substitute more than one space with just one (also messes up windows file naming)
	my $saveBase = sprintf("%s/%s-%02s-%02s-%s",$year,$year,$month,$day,$title);

	if (!-e $year) { #Make sure directory exists, if not create it
		if (!mkdir($year)) {
			warn "Could not create directory $year: $!\n";
			next RECORD;
		}
	}

	if ($DEBUGPIC) {
		print "$picBase.jpg\n";
		next RECORD;
	}

	# The server is not consistent about extensions: most comics are .jpg,
	# some are .gif and a couple are .GIF (yes, has to be caps).  Try each in
	# turn and stop at the first one that is already on disk or downloads.
	my $picName;
	my $stored = 0;
	foreach my $ext (qw(jpg gif GIF)) {
		$picName = "$picBase.$ext";
		my $saveName = "$saveBase.$ext";
		if (-e $saveName) {
			if ($verbose) { print "File Skipped, Already Exists: $saveName\n"; }
			$stored = 1;
			last;
		}
		my $res = getstore("http://img.penny-arcade.com/".$picName,$saveName);
		# Called with & so the call is valid whether or not the sub carries a prototype.
		if (&IndicatesSuccess($res)) {
			if ($verbose) { print "Finished: file successfully downloaded to $saveName\n"; }
			$stored = 1;
			last;
		}
	}
	if (!$stored) {
		if ($LOGERROR) { push(@ERROR, "ERR: Could not find $picName on the server\n"); }
		print "ERROR: Could Not find picture $picName on the server\n";
	}
}

untie @file;
if ($LOGERROR) { untie @ERROR; }

# IndicatesSuccess($status)
#
# Takes an HTTP status code (the script passes the value returned by
# getstore) and returns 1 when it is a three-digit 2xx code, 0 otherwise.
# An undefined status counts as failure.
#
# The match is anchored so that only an exact 2xx code is accepted; a longer
# value that merely contains "2" followed by two digits is not a success.
sub IndicatesSuccess
{
	my ($Response) = @_;
	return(0) unless defined $Response;
	if ($Response =~ /\A2\d\d\z/)
	{
		return(1);
	}
	return(0);
}

# syntax()
#
# Prints the usage/help text to standard output.  The script name shown is
# the last path component of $0 (either / or \ is treated as a separator);
# if that cannot be extracted, $0 itself is used.  Takes no arguments and
# does not exit; the caller decides what to do afterwards.
sub syntax {
	my ($Script) = ( $0 =~ m#([^\\/]+)$# );
	$Script = $0 unless defined $Script;
	my $Line = "-" x length($Script);
	print <<EOM;
$Script
$Line
Script to create an archive of Penny-Arcade comics
Version: $VERSION
Syntax: 
    $Script [-b <beginyear>] [-e <endyear>] [-l] [-o] [-u] [-v] [-h|?]

    -b <beginyear>...Specify the begining year to search for comics defaults
                     to 1998 which is the first year they started.
    -e <endyear>.....Specify the ending year to use when searching for comics
                     defaults to current year.              
    -h or -?.........Help - displays this message.
    -l...............Log errors - also records comics that could not be
                     downloaded in Error.txt.
    -o...............Overwrite mode, program will not overwrite dates.txt
                     by default, use this to reprobe the PA site for 
                     comic dates.
    -u...............Update - Quick check for this year to grab updates.
    -v...............Verbose - displays information as to what the program is 
                     doing to the screen.

(c) 2005 by Rich Waters neflar\@yahoo.com
EOM
}
