Basic Google Cache Parser

This script reads a saved Google results page (queryresults.html), finds every Google cache link on it, and downloads each cached page to a numbered HTML file.

#!C:/Perl/bin/Perl
#replace first line with "#!/usr/bin/perl" on a Linux/Unix operating system.

##############################################
#        BASIC GOOGLE CACHE PARSER           #
#                                            #
#             topictracer.com                #
#  Creative Commons Attribution 2.5 License  #
##############################################

use strict;
use warnings;

require LWP::UserAgent;

# Basic Google cache downloader.
#
# Reads a saved Google results page (the whole HTML document) from
# $results_file, extracts every double-quoted href, and for each href that
# looks like a Google cache link downloads the target and writes it to
# "<n>.html" in the current directory, where <n> is the position of that
# cache link on the page (1, 2, 3, ...).
#
# Progress is printed to STDOUT, one line per cache link: the (possibly
# shortened) URL followed by " saved." or " failed: <status>".
# Dies if the results file cannot be read or an output file cannot be written.

my $results_file = 'queryresults.html';

# Flush STDOUT after every print so the URL shows up while the download is
# still in progress, not only once the trailing newline is printed.
$| = 1;

# Google seems to block the default libwww-perl agent string, so we present
# a regular browser user agent instead.
my $ua = LWP::UserAgent->new(
    keep_alive => 1,
    agent      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    timeout    => 10,
);

# Slurp the whole page into one scalar. The handle is closed right away so
# it is not held open for the duration of the downloads.
open(my $in, '<', $results_file)
    or die("cannot open file $results_file - $!");
my $text = do { local $/; <$in> };
close($in);
$text = '' unless defined $text;

# Collect the value of every double-quoted href attribute on the page.
my @hrefs = ($text =~ m|href\s*=\s*\"([^\"]+)\"|ig);

my $i = 0;    # number of cache links seen so far; also the output file number
foreach my $href (@hrefs) {

    # Basic check: only Google cache links are of interest.
    next unless $href =~ m|search\?q=cache:|i;

    # Count cache links only, so files are named 1.html, 2.html, ... in page
    # order. A failed download leaves a gap instead of shifting later names.
    $i++;

    # Attribute values are HTML-escaped; turn "&amp;" back into "&" so the
    # query string is sent as the page author intended.
    (my $url = $href) =~ s/&amp;/&/gi;

    # Shorten long URLs so each progress line stays readable.
    my $shown = length($url) < 70 ? $url : substr($url, 0, 67) . "...";
    print $shown;

    my $response = $ua->get($url);
    my $content  = $response->is_success ? $response->content() : undef;

    if (!defined $content) {
        # Finish the progress line so the next URL starts on a fresh one.
        print " failed: ", $response->status_line, "\n";
        next;
    }

    my $out_name = "$i.html";
    open(my $out, '>', $out_name)
        or die("cannot write to file $out_name - $!");
    binmode($out);    # content() is raw bytes; avoid newline translation
    print {$out} $content;
    close($out)
        or die("cannot write to file $out_name - $!");

    print " saved.\n";
}

Source: (2006-11-15 13:31:31)