#!/usr/local/bin/perl -w
package SwishSearch;
use strict;
# an optional file loading special environment
BEGIN { eval { require "./apache.org-setup.pl" } }
use lib qw( modules ); ### This may need to be adjusted!
### It should point to the location of the
### associated script modules directory
my $DEFAULT_CONFIG_FILE = '.swishcgi.conf';
###################################################################################
#
# If this text is displayed on your browser then your web server
# is not configured to run .cgi programs. Contact your web server administrator.
#
# To display documentation for this program type "perldoc swish.cgi"
#
# swish.cgi $Revision: 66589 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org
# Example CGI program for searching with SWISH-E
#
# This example program will only run under an OS that supports fork().
# Ok, piped opens.
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The above lines must remain at the top of this program
#
# $Id: swish.cgi 66589 2002-07-01 05:08:43Z moseley $
#
####################################################################################
# This is written this way so the script can be used as a CGI script or a mod_perl
# module without any code changes.
# use CGI (); # might not be needed if using Apache::Request
#=================================================================================
# CGI entry point
#
#=================================================================================
# Run the script -- entry point if running as a CGI script
# Plain CGI invocation: when MOD_PERL is not set in the environment, build the
# built-in defaults, overlay the on-disk config file (if any), and serve the
# request right away.
if ( !$ENV{MOD_PERL} ) {
    process_request( merge_read_config( default_config() ) );
}
#==================================================================================
# This sets the default configuration parameters
#
# Any configuration read from disk is merged with these settings.
#
# Only a few settings are actually required. Some reasonable defaults are used
# for most. In fact, you can probably create a complete config as:
#
# return {
# swish_binary => '/usr/local/bin/swish-e',
# swish_index => '/usr/local/share/swish/index.swish-e',
# title_property => 'swishtitle', # Not required, but recommended
# };
#
# But, that doesn't really show all the options.
#
# You can modify the options below, or you can use a config file. The config file
# is .swishcgi.conf by default (read from the current directory) that must return
# a hash reference. For example, to create a config file that changes the default
# title and index file name, plus uses Template::Toolkit to generate output
# create a config file as:
#
# # Example config file -- returns a hash reference
# {
# title => 'Search Our Site',
# swish_index => 'index.web',
#
# template => {
# package => 'TemplateToolkit',
# file => 'search.tt',
# options => {
# INCLUDE_PATH => '/home/user/swish-e/example',
# },
# },
# };
#
#
#-----------------------------------------------------------------------------------
# default_config()
#
# Builds and returns a fresh hash reference holding the built-in settings.
# The swish-e binary is taken from $ENV{SWISH_BINARY_PATH}, falling back to
# './swish-e'; the sub dies if that path is not executable.
sub default_config {

    ##### Configuration Parameters #########

    #---- This lists all the options, with many commented out ---
    # This is the configuration used by default: both the CGI entry point and
    # handler() hand the result of this sub to process_request().
    # You should adjust for your site, and how your swish index was created.

    ##>>
    ##>>  Please don't post this entire section on the swish-e list if looking for help!
    ##>>
    ##>>  Send a small example, without all the comments.

    #======================================================================
    # NOTE: Items beginning with an "x" or "#" are commented out.
    #       The "x" form simply renames (hides) that setting.  It's used
    #       to make it easy to disable a multi-line configuration setting.
    #
    #   If you do not understand a setting then best to leave the default.
    #
    #   Please follow the documentation (perldoc swish.cgi) and set up
    #   a test using the defaults before making changes.  It's much easier
    #   to modify a working example than to try to get a modified example to work ;)
    #
    #   Again, this is a Perl hash structure.  Commas are important.
    #======================================================================

    my $swish_binary = $ENV{SWISH_BINARY_PATH} || './swish-e';

    if ( !-x $swish_binary ) {
        die "Cannot find swish-e at $swish_binary: $!";
    }

    my %settings = (

        title        => 'Search perl.apache.org site',  # Title of your choice.  Displays on the search page
        swish_binary => $swish_binary,                  # Location of swish-e binary

        # By default, this script tries to read a config file.  You should
        # probably comment this out if not used, to save a disk stat.
        config_file => $DEFAULT_CONFIG_FILE,            # Default config file

        # The location of your index file.  Typically, this would not be in
        # your web tree.
        # If you have more than one index to search then specify an array
        # reference.  e.g. swish_index =>[ qw/ index1 index2 index3 /],
        swish_index => 'index.swish-e',                 # Location of your index file
                                                        # See "select_indexes" below for how to
                                                        # select more than one index.

        page_size => 15,                                # Number of results per page - default 15

        # Property name to use as the main link text to the indexed document.
        # Typically, this will be 'swishtitle' if you have indexed html documents,
        # but you can specify any PropertyName defined in your document.
        # By default, swish will return the pathname for documents that do not
        # have a title.
        # In other words, this is used for the text of the links of the search results.
        title_property => 'swishtitle',

        # Prepend this path to the filename (swishdocpath) returned by swish.  This is used to
        # make the href link back to the original document.  Comment out to disable.
        #prepend_path => 'http://localhost/mydocs',

        # Swish has a configuration directive "StoreDescription" that will save part or
        # all of a document's contents in the index file.  This can then be displayed
        # along with results.  If you are indexing a lot of files this can use a lot of disk
        # space, so test carefully before indexing your entire site.
        # Building swish with zlib can greatly reduce the space used by StoreDescription.
        #
        # This setting tells this script to display this description.
        # Normally, this should be 'swishdescription', but you can specify another property name.
        # There is no default.
        description_prop => 'swishdescription',

        # Property names listed here will be displayed in a table below each result.
        # You may wish to modify this list if you are using document properties (PropertyNames)
        # in your swish-e index configuration.
        # There is no default.
        display_props => [qw/swishlastmodified swishdocsize swishdocpath/],

        # Results can be sorted by any of the properties listed here.
        # They will be displayed in a drop-down list.
        # Again, you may modify this list if you are using document properties of your own creation.
        # Swish uses the rank as the default sort.
        sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],

        # Secondary_sort is used to sort within a sort.
        # You may enter a property name followed by a direction (asc|desc).
        secondary_sort => [qw/swishlastmodified desc/],

        # You can limit by MetaNames here.  Names listed here will be displayed in
        # a line of radio buttons.
        # The default is to not allow any metaname selection.
        # To use this feature you must define MetaNames while indexing.
        # The special "swishdefault" says to search any text that was not indexed
        # as a metaname (e.g. the body of a HTML document).
        # To see how this might work, add to your config file:
        #   MetaNames swishtitle swishdocpath
        # and try:
        metanames => [qw/swishdefault swishtitle swishdocpath/],

        # Another example: if you indexed an email archive
        # that defined the metanames subject name email (as in the swish-e discussion archive)
        # you might use:
        #metanames => [qw/body subject name email/],

        # Note that you can do a real "all" search if you use nested metanames in your source documents.
        # Nesting metanames is most common with XML documents.

        # These are used to map MetaNames and PropertyNames to user-friendly names
        # on the form.
        name_labels => {
            swishdefault      => 'Title & Body',
            swishtitle        => 'Title',
            swishrank         => 'Rank',
            swishlastmodified => 'Last Modified Date',
            swishdocpath      => 'Document Path',
            swishdocsize      => 'Document Size',
            subject           => 'Message Subject',
            name              => "Poster's Name",
            email             => "Poster's Email",
            sent              => 'Message Date',
            ALL               => 'Message text',
        },

        timeout          => 10,   # limit time used by swish when fetching results - DoS protection.
        max_query_length => 100,  # limit length of query string.  Swish also has a limit (default is 40)
                                  # You might want to set swish-e's limit higher, and use this to get a
                                  # somewhat more friendly message.

        # These settings will use some crude highlighting code to highlight search terms in the
        # property specified above as the description_prop (normally, 'swishdescription').
        max_chars => 500,         # If "highlight" is not defined, then just truncate the description to this many *chars*.
                                  # If you want to go by *words*, enable highlighting,
                                  # and then comment-out show_words.  It will be a little slower.

        # This structure defines term highlighting, and what type of highlighting to use.
        highlight => {

            # Pick highlighting module -- you must make sure the module can be found.

            # Ok speed, but doesn't handle phrases.
            # Deals with stemming, but not stopwords.
            #package => 'DefaultHighlight',

            # Somewhat slow, but deals with phrases, stopwords, and stemming.
            # Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
            package => 'PhraseHighlight',

            # Fast: phrases without regard to wordcharacter settings,
            # doesn't do context display, so must match in first X words,
            # doesn't handle stemming or stopwords.
            #package => 'SimpleHighlight',

            show_words  => 10,    # Number of swish words to show around highlighted word
            max_words   => 100,   # If no words are found to highlight then show this many words
            occurrences => 6,     # Limit number of occurrences of highlighted words

            # HTML highlighting codes.
            # NOTE(review): both values are empty strings here; the intended
            # opening/closing markup may have been lost -- confirm.
            #highlight_on => '',
            #highlight_off => '',
            highlight_on  => '',
            highlight_off => '',

            meta_to_prop_map => {   # this maps search metatags to display properties
                swishdefault => [ qw/swishtitle swishdescription/ ],
                swishtitle   => [ qw/swishtitle/ ],
                swishdocpath => [ qw/swishdocpath/ ],
            },
        },

        # If you specify more than one index file (as an array reference) you
        # can set this to allow selection of which indexes to search.
        # The default is to search all indexes specified if this is not used.
        # When used, the first index is the default index.
        # You need to specify your indexes as an array reference:
        #swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
        Xselect_indexes => {
            #method => 'radio_group',   # pick radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns     => 3,
            labels      => [ 'Main Index', 'Other Index', qw/ two three four/ ],  # Must match up one-to-one
            description => 'Select Site: ',
        },

        # Similar to select_indexes, this adds a metaname search
        # based on a metaname.  You can use any metaname, and this will
        # add an "AND" search to limit results to a subset of your records.
        # i.e. it adds something like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
        # Swish-e's ExtractPath would work well with this.  For example, the apache docs:
        #   ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
        #   ExtractPathDefault site other
        Xselect_by_meta => {
            #method => 'radio_group',   # pick: radio_group, popup_menu, or checkbox_group
            method => 'checkbox_group',
            #method => 'popup_menu',
            columns  => 3,
            metaname => 'site',         # Can't be a metaname used elsewhere!
            values   => [qw/misc mod vhosts other/],
            labels   => {
                misc   => 'General Apache docs',
                mod    => 'Apache Modules',
                vhosts => 'Virutal hosts',
            },
            description => 'Limit search to these areas: ',
        },

        # The 'template' setting defines what generates the output.
        # The default is "TemplateDefault" which is reasonably ugly.
        # Note that some of the above options may not be available
        # for templating, as it's up to you to lay out the form
        # and results in your template.
        xtemplate => {
            package => 'TemplateDefault',
        },

        xtemplate => {
            package => 'TemplateDumper',
        },

        xtemplate => {
            package => 'TemplateToolkit',
            file    => 'search.tt',
            options => {
                INCLUDE_PATH => '/home/user/swish-e/example',
                #PRE_PROCESS => 'config',
            },
        },

        xtemplate => {
            package => 'TemplateHTMLTemplate',
            options => {
                filename          => 'swish.tmpl',
                die_on_bad_params => 0,
                loop_context_vars => 1,
                cache             => 1,
            },
        },

        # The "on_intranet" setting is just a flag that can be used to say you do
        # not have an external internet connection.  It's here because the default
        # page generation includes links to images on swish-e.org and on www.w3.org.
        # If this is set to one then those images will not be shown.
        # (This only affects the default output module TemplateDefault)
        on_intranet => 0,

        # Here you can hard-code debugging options.  They will help you find
        # where you made your mistake ;)
        # Using all at once will generate a lot of messages to STDERR.
        # Please see the documentation before using these.
        # Typically, you will set these from the command line instead of in the configuration.
        # debug_options => 'basic, command, headers, output, summary, dump',

        # This defines the package object for reading CGI parameters.
        # Defaults to CGI.  Might be useful with mod_perl.
        # request_package => 'CGI',
        # request_package => 'Apache::Request',

        # Minor adjustment to page display.  The page navigation normally looks like:
        #   Page: 1 5 6 7 8 9 24
        # where the first page and last page are always displayed.  These can be disabled
        # by setting to true values ( 1 ).
        no_first_page_navigation => 0,
        no_last_page_navigation  => 0,

        # Limit to date ranges.
        # This adds in the date_range limiting options.
        # You will need the DateRanges.pm module from the author to use that feature.
        # Normally, you will want to limit by the last modified date, so specify
        # "swishlastmodified" as the property_name.  If indexing a mail archive, and, for
        # example, you store the date (a unix timestamp) as "date" then specify
        # "date" as the property_name.
        date_ranges => {
            property_name => 'swishlastmodified',   # property name to limit by

            # what you specify here depends on the DateRanges.pm module.
            time_periods => [
                'All',
                'Today',
                'Yesterday',
                #'Yesterday onward',
                'This Week',
                'Last Week',
                'Last 90 Days',
                'This Month',
                'Last Month',
                #'Past',
                #'Future',
                #'Next 30 Days',
            ],

            line_break => 0,
            default    => 'All',
            date_range => 1,
        },
    );

    return \%settings;
}
#^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#========================================================================================
#=================================================================================
# mod_perl entry point
#
# As an example, you might use a PerlSetVar to point to paths to different
# config files, and then cache the different configurations by path.
#
#=================================================================================
# Merged configurations, keyed by the config file path they were read from.
my %cached_configs;

# handler( $r )
#
# mod_perl entry point.  $r is the request object; only its dir_config()
# method is used here, to look up the 'Swish_Conf_File' setting.
#
# With no Swish_Conf_File configured the request is served from a fresh
# default_config().  Otherwise the defaults are merged with that file once,
# kept in %cached_configs, and reused for every later request naming the
# same path.  Always returns Apache::Constants::OK().
sub handler {
    my $r = shift;

    my $config_path = $r->dir_config( 'Swish_Conf_File' );

    # No per-directory config file: fall back to the hard-coded settings.
    unless ( $config_path ) {
        process_request( default_config() );
        return Apache::Constants::OK();
    }

    # First request for this path: load the file and remember the result.
    unless ( $cached_configs{ $config_path } ) {
        my $config = default_config();
        $config->{config_file} = $config_path;

        # Merge with disk config file.
        $cached_configs{ $config_path } = merge_read_config( $config );
    }

    process_request( $cached_configs{ $config_path } );
    return Apache::Constants::OK();
}
#============================================================================
# Read config settings from disk, and merge
# Note, all errors are ignored since by default this script looks for a
# config file.
#
#============================================================================
# merge_read_config( $config )
#
# Initialises the debug flag constants and $config->{debug}, then evaluates
# the file named by $config->{config_file} and overlays the hash it returns
# on top of $config.
#
# Returns $config itself when no config_file is set, or when the file is the
# default config file and does not exist; otherwise returns a new hash
# reference in which the file's keys override $config's.
# Dies when the file does not evaluate to a hash reference (other than the
# missing-default-file case).
sub merge_read_config {
    my $config = shift;

    set_default_debug_flags();
    set_debug( $config );   # get from config or from %ENV

    my $file = $config->{config_file};
    return $config unless $file;

    # NOTE(review): "do FILE" searches @INC for a bare relative name such as
    # the default config file; on a perl without '.' in @INC that file may not
    # be found in the current directory -- confirm.
    my $return = do $file;   # load the config file

    if ( ref( $return ) ne 'HASH' ) {

        # Grab the failure reason before the file test below can change $!.
        my $error = $@ || $!;

        # A missing *default* config file is not an error.
        my $default_is_absent = $file eq $DEFAULT_CONFIG_FILE && !-e $file;

        die "Config file '$file': $error" unless $default_is_absent;

        warn "Config file '$file': $!" if $config->{debug};
        return $config;
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        my $dumped = Data::Dumper::Dumper( $return );
        print STDERR "\n---------- Read config parameters from '$file' ------\n",
            $dumped,
            "-------------------------\n";
    }

    set_debug( $return );

    # Merge settings: values read from the file win over the defaults.
    my %merged = ( %$config, %$return );
    return \%merged;
}
#--------------------------------------------------------------------------------------------------
# set_default_debug_flags()
#
# Assigns the package-level debug bit masks.  Each flag is a distinct power
# of two so they can be OR-ed together into a single debug level.
sub set_default_debug_flags {
    $SwishSearch::DEBUG_BASIC     = 1 << 0;   # Basic debugging
    $SwishSearch::DEBUG_COMMAND   = 1 << 1;   # Show command used to run swish
    $SwishSearch::DEBUG_HEADERS   = 1 << 2;   # Swish output headers
    $SwishSearch::DEBUG_OUTPUT    = 1 << 3;   # Swish output besides headers
    $SwishSearch::DEBUG_SUMMARY   = 1 << 4;   # Summary of results parsed
    $SwishSearch::DEBUG_DUMP_DATA = 1 << 5;   # dump data that is sent to templating modules
}
#---------------------------------------------------------------------------------------------------
# set_debug( $conf )
#
# Sets $conf->{debug} from a comma-separated list of debug option names.
# The list is taken from $ENV{SWISH_DEBUG} when that is set, otherwise from
# $conf->{debug_options}.
#
#   - Neither source set: $conf->{debug} is set to 0 and the sub returns.
#   - Otherwise $conf->{debug} starts at 1 and the bit for every named
#     option (basic, command, headers, output, summary, dump; matched
#     case-insensitively) is OR-ed in, and the resulting level is printed
#     to STDERR.
#   - An unrecognised name prints the list of valid options to STDERR and
#     exits the program.
#
# The $SwishSearch::DEBUG_* values must already be set (see
# set_default_debug_flags()).
sub set_debug {
    my $conf = shift;

    # The environment takes precedence over the hard-coded config setting.
    # Previously only $ENV{SWISH_DEBUG} was parsed, so a config that set just
    # debug_options never got the flags it asked for.
    my $debug_string = $ENV{SWISH_DEBUG} || $conf->{debug_options};

    unless ( $debug_string ) {
        $conf->{debug} = 0;
        return;
    }

    # option name => [ bit mask, description shown in the help listing ]
    my %debug = (
        basic   => [ $SwishSearch::DEBUG_BASIC,     'Basic debugging' ],
        command => [ $SwishSearch::DEBUG_COMMAND,   'Show command used to run swish' ],
        headers => [ $SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish' ],
        output  => [ $SwishSearch::DEBUG_OUTPUT,    'Show output from swish' ],
        summary => [ $SwishSearch::DEBUG_SUMMARY,   'Show summary of results' ],
        dump    => [ $SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates' ],
    );

    $conf->{debug} = 1;

    for my $option ( split /\s*,\s*/, $debug_string ) {

        if ( exists $debug{ lc $option } ) {
            $conf->{debug} |= $debug{ lc $option }->[0];
            next;
        }

        # Unknown name: list the valid options (ordered by bit value) and stop.
        print STDERR "Unknown debug option '$option'.  Must be one of:\n",
            join( "\n", map { sprintf(' %10s: %10s', $_, $debug{$_}->[1]) } sort { $debug{$a}->[0] <=> $debug{$b}->[0] }keys %debug),
            "\n\n";
        exit;
    }

    print STDERR "Debug level set to: $conf->{debug}\n";
}
#============================================================================
#
# This is the main entry point, where a config hash is passed in.
#
#============================================================================
sub process_request {
my $conf = shift; # configuration parameters
# Use CGI.pm by default
my $request_package = $conf->{request_package} || 'CGI';
$request_package =~ s[::][/]g;
require "$request_package.pm";
my $request_object = $conf->{request_package} ? $conf->{request_package}->new : CGI->new;
if ( $conf->{debug} ) {
print STDERR 'Enter a query [all]: ';
my $query = ;
$query =~ tr/\r//d;
chomp $query;
unless ( $query ) {
print STDERR "Using 'not asdfghjklzxcv' to match all records\n";
$query = 'not asdfghjklzxcv';
}
$request_object->param('query', $query );
print STDERR 'Enter max results to display [1]: ';
my $max = ;
chomp $max;
$max = 1 unless $max && $max =~/^\d+$/;
$conf->{page_size} = $max;
}
# create search object
my $search = SwishQuery->new(
config => $conf,
request => $request_object,
);
# run the query
my $results = $search->run_query; # currently, results is the just the $search object
if ( $conf->{debug} ) {
if ( $conf->{debug} & $SwishSearch::DEBUG_DUMP_DATA ) {
require Data::Dumper;
print STDERR "\n------------- Results structure passed to template ------------\n",
Data::Dumper::Dumper( $results ),
"--------------------------\n";
} elsif ( $conf->{debug} & $SwishSearch::DEBUG_SUMMARY ) {
print STDERR "\n------------- Results Summary ------------\n";
if ( $results->{hits} ) {
require Data::Dumper;
print STDERR "Showing $results->{navigation}{showing} of $results->{navigation}{hits}\n",
Data::Dumper::Dumper( $results->{_results} );
} else {
print STDERR "** NO RESULTS **\n";
}
print STDERR "--------------------------\n";
} else {
print STDERR ( ($results->{hits} ? "Found $results->{hits} results\n" : "Failed to find any results\n" . $results->errstr . "\n" ),"\n" );
}
}
my $template = $conf->{template} || { package => 'TemplateDefault' };
my $package = $template->{package};
my $file = "$package.pm";
$file =~ s[::][/]g;
eval { require $file };
if ( $@ ) {
warn "$0 $@";
print <
Software Error
Software Error
Please check error log