KinoSearch::Docs::Tutorial.3pm

Langue: en

Version: 2008-11-10 (ubuntu - 08/07/09)

Section: 3 (Bibliothèques de fonctions)

NAME

KinoSearch::Docs::Tutorial - sample indexing and search applications

DESCRIPTION

The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. It requires the HTML presentation of the US Constitution included in the KinoSearch distribution, under "t/us_constitution".

Note that a proper indexer for HTML documents would not rely on quick-n-dirty regular expressions for stripping tags, as this one does for the sake of brevity --- it would use a dedicated parsing module such as HTML::Parser.

invindexer.plx

     #!/usr/bin/perl
     use strict;
     use warnings;
     
     use File::Spec;
     use KinoSearch::InvIndexer;
     use KinoSearch::Analysis::PolyAnalyzer;
     
     ### In order for invindexer.plx to work correctly, you must modify 
     ### $source_dir, $path_to_invindex, and possibly $base_url.
     ###
     ### $source_dir must lead to the directory containing the US
     ### Constitution html files.
     ###
     ### $path_to_invindex is the future location of the invindex.
     ###
     ### $base_url should reflect the location of the us_constitution directory
     ### when accessed via a web browser.
     my $source_dir       = '';
     my $path_to_invindex = '';
     my $base_url         = '/us_constitution';
     
     ### Collect the names of the html files found in $source_dir.  The
     ### pattern is anchored with \z so that only names which actually END in
     ### ".html" are kept; an unanchored /\.html/ would also pick up strays
     ### such as "amendments.html.bak".
     opendir( my $source_dh, $source_dir )
         or die "Couldn't opendir '$source_dir': $!";
     my @filenames = grep {/\.html\z/} readdir $source_dh;
     closedir $source_dh or die "Couldn't closedir '$source_dir': $!";
     
     ### STEP 1: Choose an Analyzer.
     my $analyzer
         = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
     
     ### STEP 2: Create an InvIndexer object.
     ### The arguments are kept in a list so they reach the constructor in
     ### the order written here.
     my @invindexer_args = (
         analyzer => $analyzer,
         invindex => $path_to_invindex,
         create   => 1,
     );
     my $invindexer = KinoSearch::InvIndexer->new(@invindexer_args);
     
     ### STEP 3: Define fields.
     ### One spec_field() call is made per entry, in the order listed.
     my @field_specs = (
         [ name => 'title' ],
         [ name => 'bodytext', vectorized => 1 ],
         [ name => 'url',      indexed    => 0 ],
     );
     $invindexer->spec_field( @{$_} ) for @field_specs;
     
     ### Index every html file found in $source_dir except index.html.
     foreach my $filename (@filenames) {
         next if $filename eq 'index.html';
         my $filepath = File::Spec->catfile( $source_dir, $filename );
     
         # Slurp the whole file: with $/ locally undefined, <$fh> returns
         # the entire contents in a single read.
         open( my $fh, '<', $filepath )
             or die "couldn't open file '$filepath': $!";
         my $content = do { local $/; <$fh> };
         close $fh or die "couldn't close file '$filepath': $!";
     
         ### STEP 4: Start a new document.
         my $doc = $invindexer->new_doc;
     
         # Capture straight into named variables.  A failed match yields an
         # empty list, which makes the list assignment false and triggers
         # the die.
         my ($title) = $content =~ m#<title>(.*?)</title>#s
             or die "couldn't isolate title in '$filepath'";
         my ($bodytext)
             = $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
             or die "couldn't isolate bodytext in '$filepath'";
         $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping
     
         ### STEP 5: Set the value for each field.
         $doc->set_value( url      => "$base_url/$filename" );
         $doc->set_value( title    => $title );
         $doc->set_value( bodytext => $bodytext );
     
         ### STEP 6: Add the document to the invindex.
         $invindexer->add_doc($doc);
     
         ### STEP 7: Repeat steps 4-6 for each document in the collection.
     }
     
     ### STEP 8: Finalize the invindex.
     $invindexer->finish;
 
 

search.cgi

     #!/usr/bin/perl -T
     use strict;
     use warnings;
     
     use CGI;
     use List::Util qw( max min );
     use POSIX qw( ceil );
     use KinoSearch::Searcher;
     use KinoSearch::Analysis::PolyAnalyzer;
     use KinoSearch::Highlight::Highlighter;
     
     ### Read the request parameters.  Both are optional: a missing query
     ### becomes the empty string and a missing offset becomes 0.
     my $cgi           = CGI->new;
     my $q             = $cgi->param('q');
     my $offset        = $cgi->param('offset');
     my $hits_per_page = 10;
     $q = '' unless defined $q;
     
     ### $offset comes from the browser and is later used in arithmetic and
     ### handed to seek(), so accept it only when it is a plain non-negative
     ### integer.  Anything else (negative, fractional, text) falls back to 0.
     ### Assigning from the capture also untaints the value under -T.
     $offset = ( defined $offset && $offset =~ /\A([0-9]+)\z/ ) ? $1 + 0 : 0;
     
     ### Configuration: point $path_to_invindex at the invindex created by
     ### invindexer.plx.  $base_url may also need changing so that it
     ### matches where a web browser should look for the us_constitution
     ### directory.
     my ( $path_to_invindex, $base_url ) = ( '', '/us_constitution' );
     
     ### STEP 1: Specify the same Analyzer used to create the invindex.
     my $analyzer
         = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
     
     ### STEP 2: Create a Searcher object.
     my @searcher_args = (
         invindex => $path_to_invindex,
         analyzer => $analyzer,
     );
     my $searcher = KinoSearch::Searcher->new(@searcher_args);
     
     ### STEP 3: Feed a query to the Searcher object.
     my $hits = $searcher->search($q);
     
     ### STEP 4: Arrange for highlighted excerpts to be created.
     ### The excerpts are drawn from the 'bodytext' field.
     my $highlighter = KinoSearch::Highlight::Highlighter->new(
         excerpt_field => 'bodytext',
     );
     $hits->create_excerpts( highlighter => $highlighter );
     
     ### STEP 5: Process the search.
     $hits->seek( $offset, $hits_per_page );
     
     ### STEP 6: Format the results however you like.
     
     # Build one HTML fragment per hit, then glue the fragments together.
     # The loop runs until fetch_hit_hashref returns a false value.
     my @report_chunks;
     while ( my $hit = $hits->fetch_hit_hashref ) {
         my $score = sprintf "%0.3f", $hit->{score};
         push @report_chunks, qq|
             <p>
                 <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
                 <em>$score</em>
                 <br>
                 $hit->{excerpt}
                 <br>
                 <span class="excerptURL">$hit->{url}</span>
             </p>
             |;
     }
     my $report = join '', @report_chunks;
     
     # From here on $q is only displayed, so make it safe to embed in HTML.
     $q = CGI::escapeHTML($q);
     
     # display info about the number of hits, paging links
     my $total_hits = $hits->total_hits;
     my $num_hits_info;
     if ( !length $q ) {
         # no query, no display
         $num_hits_info = '';
     }
     elsif ( $total_hits == 0 ) {
         # alert the user that their search failed
         $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
     }
     else {
         # calculate the nums for the first and last hit to display; both are
         # clamped so they never exceed the total number of hits
         my $last_result = min( ( $offset + $hits_per_page ), $total_hits );
         my $first_result = min( ( $offset + 1 ), $last_result );
     
         # display the result nums, start paging info
         $num_hits_info = qq|
             <p>
                 Results <strong>$first_result-$last_result</strong> 
                 of <strong>$total_hits</strong> for <strong>$q</strong>.
             </p>
             <p>
                 Results Page:
             |;
     
         # calculate first and last hits pages to display / link to.
         # The current page is derived from the zero-based position of the
         # first hit shown ($first_result - 1).  Dividing $first_result
         # itself over-counts by one page whenever it is an exact multiple of
         # $hits_per_page, e.g. when an out-of-range offset has been clamped
         # to the last hit.
         my $current_page = int( ( $first_result - 1 ) / $hits_per_page ) + 1;
         my $last_page    = ceil( $total_hits / $hits_per_page );
         my $first_page   = max( 1, ( $current_page - 9 ) );
         $last_page = min( $last_page, ( $current_page + 10 ) );
     
         # create a url for use in paging links
         my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
         $href .= ";offset=0" unless $href =~ /offset=/;
     
         # Build one paging link.  Whatever value currently follows
         # "offset=" (digits, empty, or junk) is replaced by the requested
         # offset, and the finished URL is HTML-escaped because it is placed
         # inside an attribute value.
         my $paging_link = sub {
             my ( $new_offset, $label ) = @_;
             ( my $target = $href ) =~ s/(?<=offset=)[^;&]*/$new_offset/;
             $target = CGI::escapeHTML($target);
             return qq|<a href="$target">$label</a>\n|;
         };
     
         # generate the "Prev" link;
         if ( $current_page > 1 ) {
             my $new_offset = ( $current_page - 2 ) * $hits_per_page;
             $num_hits_info .= $paging_link->( $new_offset, '&lt;= Prev' );
         }
     
         # generate paging links; the current page is shown as plain text
         for my $page_num ( $first_page .. $last_page ) {
             if ( $page_num == $current_page ) {
                 $num_hits_info .= qq|$page_num \n|;
             }
             else {
                 my $new_offset = ( $page_num - 1 ) * $hits_per_page;
                 $num_hits_info .= $paging_link->( $new_offset, $page_num );
             }
         }
     
         # generate the "Next" link, but only while a later page exists
         if ( $current_page < $last_page ) {
             my $new_offset = $current_page * $hits_per_page;
             $num_hits_info .= $paging_link->( $new_offset, 'Next =&gt;' );
         }
     
         # finish paging links
         $num_hits_info .= "</p>\n";
     }
     
     # Emit the HTTP header and the complete page with a single print.  The
     # page template interpolates $base_url, $q, $report and $num_hits_info.
     print "Content-type: text/html\n\n", <<END_HTML;
     <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         "http://www.w3.org/TR/html4/loose.dtd">
     <html>
     <head>
         <meta http-equiv="Content-type" 
             content="text/html;charset=ISO-8859-1">
         <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
         <title>KinoSearch: $q</title>
     </head>
     
     <body>
     
         <div id="navigation">
             <form id="usconSearch" action="">
                 <strong>
                 Search the <a href="$base_url/index.html">US Constitution</a>:
                 </strong>
                 <input type="text" name="q" id="q" value="$q">
                 <input type="submit" value="=&gt;">
                 <input type="hidden" name="offset" value="0">
             </form>
         </div><!--navigation-->
     
         <div id="bodytext">
     
         $report
     
         $num_hits_info
     
         <p style="font-size: smaller; color: #666">
             <em>Powered by 
                 <a href="http://www.rectangular.com/kinosearch/">
                     KinoSearch
                 </a>
             </em>
         </p>
         </div><!--bodytext-->
     
     </body>
     
     </html>
     END_HTML
 
 
Copyright 2005-2007 Marvin Humphrey

LICENSE, DISCLAIMER, BUGS, etc.

See KinoSearch version 0.163.