#!/usr/bin/env ruby
#
# ff - Search and index document files using Ferret
#
# Author: Stuart Rackham <srackham@methods.co.nz>
# License: This source code is released under the MIT license.
# Home page: http://www.methods.co.nz/ff/
#
# Requisites:
# - Ferret 0.10.4 or better installed as a Ruby Gem.
# See http://ferret.davebalmain.com/trac for Ferret installation.
# - The accompanying ferret_helper.rb file.
# - External text file converters documented in ferret_helper.rb file.
#
# Installation:
# Drop this file and the accompanying ferret_helper.rb file into your search
# $PATH. Indexes are stored in a sub-directory called ff_index in this file's
# directory.
# Check the shebang line is right for your system.
#
# Usage text printed by the -h/--help option. It is a runtime string, so keep
# it in step with the options registered in main and with the MIME types
# listed in MIME_TYPE_FILE_EXTENSIONS.
HELP = <<EOF
NAME
ff - Search and index document files using Ferret
SYNOPSIS
ff -i [OPTIONS] DIRECTORY...
ff WORD...
ff QUERY
ff --version
OPTIONS
--version
Print program version number
-h, --help
Print this message
-i, --index
Index the document files in DIRECTORYs instead of searching
-I, --include WILDCARD
Only include file paths matching WILDCARD in index (may be repeated)
-x, --exclude WILDCARD
Exclude file paths matching WILDCARD from index (may be repeated)
DESCRIPTION
The first form recursively indexes all indexable document types in DIRECTORYs
(defaults to current directory). Currently accepted document types are PDF,
HTML, plain text, Microsoft Word and Open Document. A document's format is
determined by checking its MIME type (determined by file(1)) and file
extension -- both must be acceptable for the document to be indexed.
The Ferret index will be created in a directory called `ff_index` in the
directory containing the `ff.rb` script.
The second form lists files containing all WORDs (highest score first).
The third form lists files satisfying the Ferret Query Language query QUERY
(highest score first).
The last form prints the program version number to stdout and exits.
This help message is printed if there is a -h or --help option.
PREREQUISITES
Requires file(1) to calculate MIME types. Requires pdftotext(1),
html2text(1), antiword(1) and odt2txt(1) to index PDF, HTML, Microsoft Word
and Open Document documents respectively.
EOF
# HISTORY
# 1.1.1: 2007-01-27:
# - Added --include and --exclude options.
# 1.1.0: 2006-09-09:
# - Rewrite for Ferret 0.10.x compatibility.
# 1.0.4: 2006-05-22:
# - Fixed bug in FerretHelper.file_mime_type
# - Fixed documentation errors in ferret_helper.rb
# 1.0.3: 2006-05-15:
# - Don't assume ASCII input stream, fall back to ASCII encoder if
# input stream does not conform to the default encoding (determined
# by the locale).
# 1.0.2: 2006-04-17:
# - Strip non-ascii characters before indexing.
# 1.0.1: 2006-04-12:
# - Summary by file type.
# - Store absolute path name.
# 1.0.0: 2006-04-06: First release.
#
# Program version string, printed by the --version option.
VERS = '1.1.1'
require 'pathname'
require 'rubygems'
require 'ferret'
include Ferret
begin
require 'ferret_helper'
rescue LoadError
# Try this file's directory.
require File.join(File.dirname(Pathname.new(__FILE__).realpath),'ferret_helper')
end
include FerretHelper
# Upper bound on the number of documents a single search may return.
NUM_DOCS = 1000

# Location of the Ferret index: a directory named ff_index that sits next to
# this script (symbolic links to the script are resolved first).
INDEX_DIR = Pathname.new(__FILE__).realpath.dirname.join('ff_index').to_s

# Indexable MIME types, each mapped to the file name extensions it normally
# carries.
MIME_TYPE_FILE_EXTENSIONS = {
  'application/msword'                      => %w[.doc],
  'application/pdf'                         => %w[.pdf],
  'application/vnd.oasis.opendocument.text' => %w[.odt],
  'text/html'                               => %w[.html .htm],
  'text/plain'                              => %w[.txt]
}
# Convert the file +filename+ (of type +mime_type+) to text and append it to
# +index+ as a document with two fields: :file, the absolute path of the file,
# and :content, the extracted text.
#
# Raises a RuntimeError if the extracted text is empty or only whitespace.
# Returns whatever <tt>index << document</tt> returns.
def index_file(index, filename, mime_type)
  content = convert_to_text_string(filename, mime_type)
  if content.strip.empty?
    raise "empty document #{filename}"
  end
  index << { :file => File.expand_path(filename), :content => content }
end
# Return true if +filename+ lies below a Darcs repository directory (_darcs)
# or a hidden (dot-prefixed) directory.
#
# The first path component is never tested (it is either the empty string in
# front of an absolute path or the start of a relative one) and neither is the
# last (the file name itself). The special components '.' and '..' are not
# hidden directories, so a path such as docs/../notes/a.txt is not skipped.
def in_skipped_directory?(filename)
  directories = filename.split('/')[1...-1] || []
  directories.any? do |part|
    part == '_darcs' || (part.start_with?('.') && part != '.' && part != '..')
  end
end

# Recursively add all qualifying files in directory +dir+ to +index+.
#
# +index+::    receiver handed on to index_file for each accepted file.
# +dir+::      directory to scan recursively.
# +excludes+:: wildcards; a file path matching any of them is ignored.
# +includes+:: wildcards; when not empty a file path must match at least one.
# +counters+:: hash of MIME type => counter object responding to size, count
#              and skipped (and their writers); updated in place.
#
# A file that cannot be indexed is reported on stderr and counted as skipped;
# it never aborts the scan.
def index_directory(index, dir, excludes, includes, counters)
  # Only visit files with allowed extensions.
  pat = "**/*{#{MIME_TYPE_FILE_EXTENSIONS.values.flatten.join(',')}}"
  Dir.glob(File.join(dir, pat), File::FNM_CASEFOLD) do |filename|
    unless includes.empty? || includes.any? { |m| File.fnmatch(m, filename, File::FNM_DOTMATCH) }
      next
    end
    next if excludes.any? { |m| File.fnmatch(m, filename, File::FNM_DOTMATCH) }
    next unless File.file?(filename)
    # Skip files in Darcs repositories or hidden directories.
    next if in_skipped_directory?(filename)

    # Stays nil if the MIME type lookup itself fails.
    mime_type = nil
    begin
      $stderr.puts "indexing: #{filename}"
      # Trying to guess MIME type from file contents is not reliable for text
      # files. The strategy used here is to infer from file name extension
      # and rely on the convertor routine to fail if type is incorrect.
      mime_type = filename_mime_type(filename)
      index_file(index, filename, mime_type)
    rescue => e
      $stderr.puts "skipped: #{e.message}"
      # There is no counter when the MIME type is unknown; the file has
      # already been reported above, so just carry on with the next one.
      counter = counters[mime_type]
      counter.skipped += 1 if counter
    else
      counter = counters[mime_type]
      if counter
        counter.size += File.size(filename)
        counter.count += 1
      end
    end
  end
end
# Index the document files found under each directory in +dirs+ into the
# Ferret index at INDEX_DIR (opened with :create => true), then print
# per-MIME-type and overall totals to stderr.
#
# +dirs+::     directories to index recursively. When empty the current
#              directory is indexed, as the help text states.
# +excludes+:: wildcards of file paths to leave out of the index.
# +includes+:: wildcards of file paths to accept (empty means all).
def create_index(dirs, excludes, includes)
  # Without this default an empty list would rebuild the index with nothing
  # in it. Reassign rather than mutate: the caller passes ARGV.
  dirs = ['.'] if dirs.empty?
  Dir.mkdir INDEX_DIR unless File.directory?(INDEX_DIR)
  index = Index::IndexWriter.new(:create => true, :path => INDEX_DIR)
  # Although not intuitively obvious, until I tokenized the file name, wildcard
  # file name searches did not return all matching documents.
  index.field_infos.add_field(:file, :store => :yes, :index => :yes)
  index.field_infos.add_field(:content, :store => :no, :index => :yes)
  # One counter per MIME type, updated by index_directory.
  Struct.new('Counter', :size, :count, :skipped)
  counters = {}
  MIME_TYPE_FILE_EXTENSIONS.each_key do |key|
    counters[key] = Struct::Counter.new(0, 0, 0)
  end
  begin
    dirs.each { |dir| index_directory(index, dir, excludes, includes, counters) }
    index.optimize
  ensure
    # Always release the index, even if indexing was interrupted by an error.
    index.close
  end
  counters.each_pair do |key, value|
    $stderr.puts "\n#{key}:"
    $stderr.puts "files indexed: #{value.count} (#{value.size} bytes)"
    $stderr.puts "files skipped: #{value.skipped}" unless value.skipped.zero?
  end
  tallies = counters.values
  total_count = tallies.inject(0) { |sum, tally| sum + tally.count }
  total_size = tallies.inject(0) { |sum, tally| sum + tally.size }
  total_skipped = tallies.inject(0) { |sum, tally| sum + tally.skipped }
  $stderr.puts "\ntotal files indexed: #{total_count} (#{total_size} bytes)"
  $stderr.puts "total files skipped: #{total_skipped}" unless total_skipped.zero?
end
# Search the index stored in INDEX_DIR and print the :file field of every
# matching document to stdout, one per line, followed by the parsed query and
# the number of files found on stderr.
#
# +args+ are joined with single spaces and lower-cased to form the query
# text. The query is parsed against the :content field with :or_default
# switched off, and at most NUM_DOCS documents are requested.
def search_index(args)
  parser = QueryParser.new(:default_field => :content, :or_default => false)
  query = parser.parse(args.join(' ').downcase)
  searcher = Index::Index.new(:path => INDEX_DIR)
  hits = 0
  begin
    searcher.search_each(query, :limit => NUM_DOCS) do |doc_id, _score|
      puts searcher[doc_id][:file]
      # To show the score as well:
      #   puts "#{score}: #{index[doc][:file]}"
      # To print highlighted excerpts (the content would have to be stored
      # in the index for this to work):
      #   index.highlight(query, doc,
      #     :field => :content, :excerpt_length => 60,
      #     :pre_tag => "\033[7m", :post_tag => "\033[m"
      #   ).each { |s| puts s; puts }
      hits += 1
    end
  ensure
    # Release the index whether or not the search succeeded.
    searcher.close
  end
  $stderr.puts
  $stderr.puts "query: #{query}"
  $stderr.puts "files: #{hits}"
end
# Command-line entry point: parse the options in ARGV, then either build the
# index (-i) from the remaining arguments or run a search with them, and
# finally report the elapsed time on stderr.
#
# Exits with status 1 (after a message on stderr) when an option is invalid
# or lacks its argument, or when a search is requested without any query
# arguments. -h/--help and --version print to stdout and exit immediately.
def main
  require 'optparse'
  index = false
  excludes = []
  includes = []
  parser = OptionParser.new do |o|
    o.on '-h', '--help' do
      puts HELP
      exit
    end
    o.on '--version' do
      puts "ff #{VERS}"
      exit
    end
    o.on '-i', '--index' do
      index = true
    end
    o.on '-x', '--exclude WILDCARD' do |wildcard|
      excludes << wildcard
    end
    o.on '-I', '--include WILDCARD' do |wildcard|
      includes << wildcard
    end
  end
  begin
    # Removes the recognised options from ARGV, leaving the operands.
    parser.parse! ARGV
  rescue OptionParser::ParseError => e
    # Report bad usage cleanly instead of dying with a backtrace.
    $stderr.puts "ff: #{e.message}"
    exit 1
  end
  start = Time.now
  if index
    create_index ARGV, excludes, includes
  else
    if ARGV.empty?
      $stderr.puts 'missing search query arguments'
      exit 1
    end
    search_index ARGV
  end
  $stderr.puts "time: #{Time.now - start} seconds"
end
# Run the command-line entry point only when this file is executed directly,
# not when it is loaded by another script.
main if $PROGRAM_NAME == __FILE__