Basic Google Cache Parser
This script downloads the Google-cached copy of every result linked from a saved Google results page (queryresults.html).
#!C:/Perl/bin/Perl
#replace first line with "#!/usr/bin/perl" on a Linux/Unix operating system.
##############################################
# BASIC GOOGLE CACHE PARSER #
# #
# topictracer.com #
# Creative Commons Attribution 2.5 License #
##############################################
use strict;
use warnings;

# Loaded at run time; LWP::UserAgent pulls in HTTP::Request itself.
require LWP::UserAgent;

# Purpose: read a saved Google results page, find every Google cache link in
# it, download each cached page and store it as 1.html, 2.html, ... in the
# current directory. Progress is reported on STDOUT, one line per cache link.

# File which contains the results of a Google search query (whole HTML page).
my $results_file = 'queryresults.html';

# URLs shorter than this are printed in full; longer ones are cut down so
# that, together with the trailing "...", they occupy exactly this many
# characters.
my $max_url_width = 70;

# Flush STDOUT after every print so the URL is visible while the download is
# still running (the line is only completed once the request has finished).
$| = 1;

# Google seems to block lwp-trivial, so we use a proper user agent.
my $ua = LWP::UserAgent->new(
    keep_alive => 1,
    agent      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    timeout    => 10,
);

# Slurp the whole page into one scalar. The handle is lexical, so it is
# closed as soon as the do-block ends instead of staying open for the run.
my $text = do {
    open(my $in, '<', $results_file) or die("cannot open file - $!");
    local $/;    # undef record separator: read everything at once
    <$in>;
};
$text = '' unless defined $text;

# Every double-quoted href attribute value on the page, in document order.
my @hrefs = ($text =~ m|href\s*=\s*\"([^\"]+)\"|ig);

# Number of cache links seen so far; also the basename of the output file.
# It is advanced once per cache link (not once per href), so the files are
# numbered 1.html, 2.html, ... in the order the cache links appear, and a
# failed download leaves a gap rather than shifting later numbers.
my $cache_count = 0;

foreach my $href (@hrefs) {

    # Basic check: only Google cache links are of interest.
    next unless $href =~ m|search\?q=cache:|i;
    $cache_count++;

    # The value was lifted straight out of HTML, where "&" in a URL is
    # written as "&amp;". Undo that so the query string reaches the server
    # with its real parameter separators.
    (my $url = $href) =~ s/&amp;/&/g;

    # Strip the URL down in length to make the output look pretty.
    if (length($url) < $max_url_width) {
        print $url;
    }
    else {
        print substr($url, 0, $max_url_width - 3) . "...";
    }

    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);

    # Report failures and finish the output line, otherwise the next URL
    # would be glued onto this one and the failure would go unnoticed.
    unless ($response->is_success) {
        print " failed: " . $response->status_line . "\n";
        next;
    }

    my $content = $response->content();
    unless (defined $content) {
        print " no content.\n";
        next;
    }

    # Save content to new file 1.html, 2.html, etc. Buffered write errors
    # may only surface at close, so print and close are both checked.
    my $out_name = "$cache_count.html";
    open(my $out, '>', $out_name) or die("cannot write to file - $!");
    print {$out} $content         or die("cannot write to file - $!");
    close($out)                   or die("cannot write to file - $!");
    print " saved.\n";
}