# indexSite.pl
#
# Mark L. Irons
# 16-18 March 2001
#
#
# Process a list of HTML files, indexing the keywords listed in each
# file's META keywords tag.
#
# The output is an HTML page presenting a human-readable keyword index.
#
# HTML files with a META robots tag that indicates a page shouldn't be
# indexed (e.g., CONTENT="none" or "noindex") will not be indexed.
#
# INPUT   a list of HTML files to index, one filename per line, read
#         via <> (files named on the command line, else standard input).
#
# OUTPUT  an index, in HTML, on standard output.
#
# NOTES
#
#   Keywords are stored as hash keys, not as symbolic array names, so a
#   keyword that happens to be a Perl keyword or the name of a variable
#   in this script (e.g. "termsToIndex") is indexed like any other.
#
#   NOTE(review): the copy of this script that was reviewed had lost
#   everything written between angle brackets (the META patterns, the
#   <HANDLE> read, and all HTML markup in the output).  Those parts are
#   reconstructed below from the surviving comments and variable usage;
#   each reconstruction is marked NOTE(review) and should be compared
#   against a pristine copy if one exists.
#
# REVISION HISTORY
#
#   2003-06-28  Changed keyword and description sorts to be
#               case-insensitive.
#
#--------------------------------------------------------------------

use strict;
use warnings;

#--------------------------------------------------------------------
#
# Patterns to match.  Each is tested against one line of input at a
# time, so a META tag must sit on a single line to be recognized.
#
# NOTE(review): reconstructed; the original pattern text was lost.
# Matching is case-insensitive and expects NAME before CONTENT with
# double-quoted values -- confirm against the pages being indexed.
#
# METAKeywordsPattern captures the CONTENT of the keywords tag.
#
my $METAKeywordsPattern =
    qr{<META\s+NAME\s*=\s*"keywords"\s+CONTENT\s*=\s*"([^"]*)"}i;
#
# METADescriptionPattern captures the CONTENT of the description tag.
#
my $METADescriptionPattern =
    qr{<META\s+NAME\s*=\s*"description"\s+CONTENT\s*=\s*"([^"]*)"}i;
#
# Pattern for META tags that prevent robots from indexing a page
#
my $METARobotNoIndexPattern =
    qr{<META\s+NAME\s*=\s*"robots"\s+CONTENT\s*=\s*"[^"]*\b(?:none|noindex)\b}i;

#--------------------------------------------------------------------
#
# Get current date and convert it to ISO format
#
my (undef, undef, undef, $day, $month, $year) = localtime;
$year += 1900;                          # localtime counts years from 1900
my $ISOdate = sprintf('%04d-%02d-%02d', $year, $month + 1, $day);

#--------------------------------------------------------------------
#
# Array of months ($month from localtime is already a 0-based index)
#
my @months = ('January','February','March','April','May','June',
              'July','August','September','October','November','December');

#--------------------------------------------------------------------
#
# Data structures / variables
#
my @ignoredKeywords = ( "Mark Irons", "Mark L. Irons", "half", "Half",
                        "mark irons", "mark l. irons" );
#
# Lookup table so that every ignored keyword is dropped in one pass.
# (The old nested loop shared one splice index across all ignored
# words and spliced the array it was iterating, so it could remove
# the wrong terms.)
#
my %isIgnored = map { $_ => 1 } @ignoredKeywords;
#
# 'entries' maps each keyword to the set of files that list it:
# $entries{keyword}{filename} = 1.  This replaces the symbolic
# references (push(@$kw,...)) the script used to rely on.
#
my %entries = ( );
#
# Likewise, 'files' is an associative array that's indexed by a URL.
# The value for a given URL is the META description of that file.
# (Initialized with parentheses: "%files = { }" stored a stray
# "HASH(0x...)" key.)
#
my %files = ( );

#--------------------------------------------------------------------
#
# Loop over files, processing each.
#
PROCESSFILE:
while (my $filename = <>) {
    chomp $filename;                    # chomp, not chop: the last line may lack "\n"
    $filename =~ s{\A\./}{};            # remove leading ./ only
    next PROCESSFILE if $filename eq '';    # ignore blank input lines

    my $html;
    if (!open($html, '<', $filename)) {
        warn "Can't open $filename, skipping: $!\n";
        next PROCESSFILE;
    }

    # Per-file state.  @termsToIndex must start empty for every file,
    # otherwise a page with a description but no keywords would be
    # indexed under the previous page's keywords.
    my $keywords     = "";
    my $description  = "";
    my @termsToIndex = ( );

    while (my $line = <$html>) {
        if ($line =~ $METARobotNoIndexPattern) {
            close($html);
            next PROCESSFILE;           # page asked not to be indexed
        }
        elsif ($line =~ $METAKeywordsPattern) {     # META keywords tag
            $keywords = $1;
            $keywords =~ s/\A\s+|\s+\z//g;          # trim both ends
            @termsToIndex = grep { length($_) && !$isIgnored{$_} }
                            split(/\s*,\s*/, $keywords);
        }
        elsif ($line =~ $METADescriptionPattern) {  # META description tag
            $description = $1;
        }
    }
    close($html);

    #
    # If the keywords and descriptions aren't null, process the keywords.
    #
    if (length($keywords) + length($description) > 0) {
        $files{$filename} = $description;           # save file info
        foreach my $kw (@termsToIndex) {
            $entries{$kw}{$filename} = 1;           # add file to that kw's set
        }
    }
}                                       # done all files

#
# Write the index file.
#
# NOTE(review): all markup below is reconstructed; only the visible
# text ("Index to Mark L. Irons' Web site", "Site Index", and so on)
# survived in the reviewed copy.
#
print STDOUT <<"EndOfHTMLPreamble1";
<html>
<head>
<title>Index to Mark L. Irons' Web site</title>
EndOfHTMLPreamble1

# NOTE(review): $ISOdate had no other use in the reviewed copy, so it
# is presumed to have been emitted in a META tag here -- confirm.
print STDOUT " <meta name=\"date\" content=\"$ISOdate\">\n";

print STDOUT <<"EndOfHTMLPreamble2";
</head>
<body>
<h1>Site Index</h1>

<dl>
EndOfHTMLPreamble2

#
# Loop over keywords, writing HTML.  Keywords are sorted without
# regard to case; within a keyword, files are sorted by description
# (also case-insensitively), with the filename as a tie-breaker so
# the output order is deterministic.
#
# Keywords and descriptions were taken from HTML attribute values and
# are written back unchanged.
#
foreach my $key (sort { uc($a) cmp uc($b) or $a cmp $b } keys(%entries)) {
    print STDOUT "<dt>", $key, "</dt>\n";

    my @filesForKey = sort { uc($files{$a}) cmp uc($files{$b}) or $a cmp $b }
                      keys(%{ $entries{$key} });
    foreach my $file (@filesForKey) {
        # Fall back to the filename when a page has no description.
        my $label = length($files{$file}) ? $files{$file} : $file;
        print STDOUT "<dd><a href=\"$file\">$label</a></dd>\n";
    }
    print STDOUT "\n";
}

print STDOUT <<"EndOfHTMLPostamble1";
</dl>
EndOfHTMLPostamble1

#
# Print readable date
#
my $readableDate = "$day $months[$month] $year";
print STDOUT "<p>Last updated $readableDate</p>\n";

print STDOUT <<"EndOfHTMLPostamble2";
<address>http://www.rdrop.com/~half/siteIndex.html</address>
EndOfHTMLPostamble2

# &copy; is used rather than a literal copyright sign so the output
# does not depend on the encoding this script is saved in.
print STDOUT "<p>All contents of this Web site &copy;2001-$year Mark L. Irons.</p>\n";

print STDOUT <<"EndOfHTMLPostamble3";
</body>
</html>
EndOfHTMLPostamble3