#!/usr/bin/perl -w
#
# mindshare, Jon Udell, udell@monad.net, http://udell.roninhouse.com/
#
# This script unrolls a Yahoo category to create a list of sites,
# then asks AltaVista how many pages point to each site in the list.
# In effect, it measures the Web mindshare of the sites in this category.
#
# If you use this script, please do so judiciously,
# with respect for Yahoo and AltaVista -- two of the Net's
# most valuable resources.
#
# NOTE(review): the copy of this script that was reviewed had been run
# through an HTML-to-text conversion, which removed every HTML tag that
# appeared inside string and regex literals.  The markup below ($node_pat,
# $leaf_pat, the "</a>" terminator and "<li>" cleanup in traverse(), and
# the <table> output) is a reconstruction from the residue that was left
# behind.  Confirm it against a pristine copy before relying on it.

use strict;
use warnings;

use LWP::Simple;

my $host = "http://dir.yahoo.com";

#my $root = "/Computers_and_Internet/Software/Operating_Systems/Windows/";
#my $root = "/Science/Nanotechnology/";
my $root = "/Computers_and_Internet/News_and_Media/Magazines/";
#my $root = "/Business_and_Economy/Companies/Computers/Business_to_Business/Software/Internet/World_Wide_Web/";

# Patterns for the opening anchor tag of a sub-category (node) and of an
# external site (leaf).
# NOTE(review): reconstructed; the two patterns were indistinguishable in
# the damaged copy, and any tag that followed the final ">" was lost.
my $node_pat = "<li><a href=[^>]+>";
my $leaf_pat = "<li><a href=[^>]+>";

my %seen   = ();    # address => number of times encountered
my %sites  = ();    # site (URL without "http://") => link title
my %shares = ();    # site => page count reported by AltaVista

my $domchars = "[a-zA-Z0-9-]";

# build a hashtable of sites and titles
traverse($root);

# build a hashtable of mindshare numbers for each site
foreach my $key ( sort keys %sites ) {
    my $title = $sites{$key};

    # Work on a copy so the title lookup above is not affected when the
    # redirect prefix (everything up to the first '*') is removed.
    my $site = $key;
    $site =~ s/^[^*]+\*//;

    # not perfect: works for .com, not .co.uk, .edu.au, etc.
    my ($dom) = $site =~ m#($domchars+\.$domchars+)(?:/|$)#;
    if ( !defined $dom ) {

        # No dotted name found (e.g. host:port); fall back to the part of
        # the address before the first '/' instead of reusing a stale $1.
        ( $dom = $site ) =~ s#/.*##;
    }

    # Keep the title reachable under the key that %shares uses.
    $sites{$site} = $title unless exists $sites{$site};
    $shares{$site} = mindshare( $site, $dom, $title );
}

# print results ordered by mindshare
# NOTE(review): table markup reconstructed; only the three cell values
# (title, count, rank) survived in the damaged copy.
print "<table>\n";
my $ord = 0;
foreach my $site ( sort bynum keys %shares ) {
    $ord++;

    # Pass the data as arguments, never as the format string, so a '%' in
    # a title cannot be misread as a conversion.
    printf "<tr><td>%s</td><td>%s</td><td>%s</td></tr>\n",
        $sites{$site}, $shares{$site}, $ord;
}
print "</table>\n";

# traverse($root)
#
# Fetches the directory page at "$host$root" and walks every list-item
# link on it.  Links containing "yahoo.com" are skipped, relative links
# are followed recursively as sub-categories, and absolute http links are
# recorded in %sites (key: URL without the "http://" prefix, value: link
# title).  %seen guards against visiting the same address twice.
# Returns nothing; progress is logged to STDERR.
sub traverse {
    my ($root) = @_;

    my $raw = get "$host$root";
    if ( !defined $raw ) {

        # LWP::Simple::get returns undef on failure.
        print STDERR "fetch failed: $host$root\n";
        return;
    }

    # NOTE(review): the text that terminated the title capture was lost;
    # "</a>" is assumed.
    while ( $raw =~ m#($node_pat|$leaf_pat)(.+)</a>#g ) {
        my $leaf_or_node = $1;
        my $title        = $2;

        # Without a quoted href there is no address to work with.
        next unless $leaf_or_node =~ m#"([^"]+)"#;
        my $leaf_or_node_addr = $1;

        # NOTE(review): "<li>" is assumed here as well.
        $leaf_or_node_addr =~ s#<li>##g;

        # Drop a redirect prefix: everything up to the first '*'.
        $leaf_or_node_addr =~ s/^[^*]+\*//;
        print STDERR "$leaf_or_node_addr\n";

        next if $leaf_or_node_addr =~ m#yahoo\.com#;

        if ( defined $seen{$leaf_or_node_addr} ) {
            print STDERR
                "seen: $leaf_or_node_addr, $seen{$leaf_or_node_addr}\n";
            $seen{$leaf_or_node_addr}++;
            next;
        }
        $seen{$leaf_or_node_addr} = 1;

        if ( $leaf_or_node_addr !~ m#^http# ) {

            # Sub-category: absolute paths are used as-is, relative ones
            # are resolved against the current category.
            if ( substr( $leaf_or_node_addr, 0, 1 ) eq '/' ) {
                traverse($leaf_or_node_addr);
            }
            else {
                traverse("$root$leaf_or_node_addr");
            }
        }
        else {
            # print STDERR "\"$leaf_or_node_addr\" => \"$title\"\n";
            my $site = $leaf_or_node_addr;
            $site =~ s#http://##;
            $sites{$site} = $title;
        }
    }
    return;
}

# mindshare($site, $dom, $title)
#
# Asks AltaVista how many pages link to $site, excluding pages whose URL
# contains $dom (the site's own domain).  $title is used only for the
# STDERR log line.  Returns the page count as a plain integer, or 0 when
# the request fails or the result page has no "About N pages" text.
sub mindshare {
    my ( $site, $dom, $title ) = @_;

    my $result = get "http://www.altavista.com/cgi-bin/query?pg=q&kl=XX&q=link%3A$site+-url%3A$dom";
    # my $result = get "http://www.altavista.com/cgi-bin/query?pg=q&kl=XX&q=link%3A$site";

    my $count = 0;
    if ( defined $result && $result =~ m#About ([,\d]+) pages# ) {
        $count = $1;

        # Remove every thousands separator, not just the first, so that
        # "1,234,567" compares numerically as 1234567.
        $count =~ s/,//g;
    }
    print STDERR "$dom\t$site\t$title\t$count\n";
    return $count;
}

# Sort comparator: orders sites by descending mindshare count.
sub bynum {
    return $shares{$b} <=> $shares{$a};
}