#!/usr/bin/perl -w 
# Extract Google/MSN/Live/Yahoo/Ask search terms from the referer field of an HTTP access log.
# Referer logging needs to be enabled; pass -n to prefix each term with the client IP address.

use Getopt::Std;
use strict;

# -n : prefix every printed search term with the client IP found on the line.
our($opt_n);
getopts('n') or die "unknown option\n";

# Referer patterns that identify a result page of a supported search engine.
my @SEARCH_ENGINE_PATTERNS = (
    qr/google\.\S+\/search/,
    qr/google\.\S+\/scholar/,
    qr/search\.msn\.com\/search/,
    qr/search\.live\.com\/search/,
    qr/search\.yahoo\..*\/search/,
    qr/www\.ask\.com\/web/,
);

# Replace every %XX escape in the given string with the byte it encodes.
sub url_decode {
    my ($text) = @_;
    $text =~ s/%([0-9a-fA-F][0-9a-fA-F])/chr(hex($1))/eg;
    return $text;
}

# Return true when the referer URL matches one of the supported search engines.
sub is_search_referer {
    my ($ref) = @_;
    for my $pattern (@SEARCH_ENGINE_PATTERNS) {
        return 1 if $ref =~ $pattern;
    }
    return 0;
}

# Extract the decoded search terms from a search-engine referer URL.
# Returns the query string, or undef when no query parameter is recognised.
sub extract_query {
    my ($ref) = @_;
    my $raw;

    if ($ref =~ /translate\.google.*prev=([^&]*)/) {
        # The "prev" parameter carries the (escaped) original search URL,
        # so it is decoded once here and its query once more below.
        my $prev = url_decode($1);
        if ($prev =~ /[^a-z]q=([^&]*)/) {
            # Find q= wherever it sits, not only directly after "/search?".
            $raw = $1;
        }
        else {
            # No q= inside: keep the decoded value up to the first '&'.
            ($raw = $prev) =~ s/\&.*//;
        }
    }
    elsif ($ref =~ /as_epq=([^&]+)/ || $ref =~ /[^a-z]q=([^&]*)/) {
        # An empty as_epq= (present in every advanced-search URL) must not
        # hide the real query in q= / as_q=, hence the non-empty match.
        $raw = $1;
    }
    elsif ($ref =~ /[?&]p=([^&]*)/) {
        # Anchored on a parameter boundary so that "cop=", "fp=" and
        # similar parameters are not mistaken for "p=".
        $raw = $1;
    }
    else {
        return;
    }

    # '+' means space; translate it before decoding so that %2B stays '+'.
    $raw =~ tr/+/ /;
    return url_decode($raw);
}

while (my $line = <>) {
    # Only the referer field is needed: the quoted string that follows the
    # quoted request and two further fields, and precedes a final quoted field.
    my ($ref) = $line =~ /"[^"]*" \S+ \S+ "([^"]*)" "[^"]*"/;

    next unless defined($ref);
    next if $ref eq "-";
    next unless is_search_referer($ref);

    my $query = extract_query($ref);
    unless (defined $query) {
        # Diagnostics go to STDERR so they do not pollute the term list.
        warn "cannot parse search string \"$ref\"\n";
        next;
    }

    if ($::opt_n) {
        # First dotted-quad on the line; '-' when the line has none.
        my ($ip) = $line =~ /(\d+\.\d+\.\d+\.\d+)/;
        $ip = '-' unless defined $ip;
        print "$ip ";
    }
    print $query, "\n";
}
