/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Diagnostics;
using System.IO;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using FSDirectory = Lucene.Net.Store.FSDirectory;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Demo
{
/// <summary>
/// Command-line indexer for HTML and plain-text files. Walks a root
/// directory and either builds a fresh index or incrementally updates an
/// existing one, keyed on each document's "uid" term.
/// </summary>
public static class IndexHTML
{
    /// <summary>Command-line synopsis printed whenever the arguments are invalid.</summary>
    private const string Usage = "IndexHTML [-create] [-index <index>] <root_directory>";

    /// <summary>
    /// Program entry point. Parses the command line, then runs either a
    /// complete reindex (<c>-create</c>) or a stale-removal pass followed by
    /// an incremental reindex, and finally optimizes and commits the index.
    /// </summary>
    /// <param name="argv">
    /// Command-line arguments: optional <c>-create</c>, optional
    /// <c>-index</c> followed by the index directory (defaults to "index"),
    /// and the root directory to index as the last argument.
    /// </param>
    [STAThread]
    public static void Main(System.String[] argv)
    {
        try
        {
            var index = new DirectoryInfo("index");
            bool create = false;
            DirectoryInfo root = null;

            if (argv.Length == 0)
            {
                Console.Error.WriteLine("Usage: " + Usage);
                return;
            }

            for (int i = 0; i < argv.Length; i++)
            {
                if (argv[i].Equals("-index"))
                {
                    // parse -index option; it needs a value after it, so a
                    // trailing "-index" is a usage error rather than an
                    // out-of-range array access.
                    if (i + 1 >= argv.Length)
                    {
                        Console.Error.WriteLine("Usage: " + Usage);
                        return;
                    }
                    index = new DirectoryInfo(argv[++i]);
                }
                else if (argv[i].Equals("-create"))
                {
                    // parse -create option
                    create = true;
                }
                else if (i != argv.Length - 1)
                {
                    // Only the last argument may be a non-option (the root directory).
                    Console.Error.WriteLine("Usage: " + Usage);
                    return;
                }
                else
                {
                    root = new DirectoryInfo(argv[i]);
                }
            }

            if (root == null)
            {
                Console.Error.WriteLine("Specify directory to index");
                Console.Error.WriteLine("Usage: " + Usage);
                return;
            }

            // Measure the whole run. Subtracting DateTime.Millisecond values
            // would only compare the 0-999 millisecond components.
            var stopwatch = Stopwatch.StartNew();

            using (var writer = new IndexWriter(FSDirectory.Open(index), new StandardAnalyzer(Version.LUCENE_30), create, new IndexWriter.MaxFieldLength(1000000)))
            {
                if (!create)
                {
                    // We're not creating a new index; iterate the existing
                    // index and remove any stale documents first.
                    IndexDocs(writer, root, index, Operation.RemoveStale);
                }

                var operation = create
                    ? Operation.CompleteReindex
                    : Operation.IncrementalReindex;
                IndexDocs(writer, root, index, operation); // add new docs

                Console.Out.WriteLine("Optimizing index...");
                writer.Optimize();
                writer.Commit();
            }

            stopwatch.Stop();
            Console.Out.Write(stopwatch.ElapsedMilliseconds);
            Console.Out.WriteLine(" total milliseconds");
        }
        catch (Exception e)
        {
            // Print the full exception (type, message and stack trace), not
            // just the stack trace, so the cause of the failure is visible.
            Console.Error.WriteLine(e);
        }
    }

    /// <summary>
    /// Walks the directory hierarchy in uid order while keeping a uid term
    /// iterator over the existing index in sync. Mismatches indicate one of:
    /// (a) old documents to be deleted; (b) unchanged documents, to be left
    /// alone; or (c) new documents, to be indexed.
    /// </summary>
    /// <param name="writer">Writer that receives additions and deletions.</param>
    /// <param name="file">Root directory to walk.</param>
    /// <param name="index">Directory of the existing index; only opened for non-complete operations.</param>
    /// <param name="operation">Which pass to perform.</param>
    private static void IndexDocs(IndexWriter writer, DirectoryInfo file, DirectoryInfo index, Operation operation)
    {
        if (operation == Operation.CompleteReindex)
        {
            // Perform a full reindexing; no uid iterator is needed.
            IndexDirectory(writer, null, file, operation);
            return;
        }

        // Perform an incremental pass (stale removal or incremental reindex).
        using (var reader = IndexReader.Open(FSDirectory.Open(index), true)) // open existing index
        using (var uidIter = reader.Terms(new Term("uid", ""))) // init uid iterator
        {
            IndexDirectory(writer, uidIter, file, operation);

            if (operation == Operation.RemoveStale)
            {
                // Delete the remaining, presumed stale, documents. This works
                // because the call to IndexDirectory above has positioned
                // uidIter after every uid that matches an existing file. Any
                // uid still left belongs to a document whose file has been
                // deleted since it was indexed.
                while (uidIter.Term != null && uidIter.Term.Field == "uid")
                {
                    Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
                    writer.DeleteDocuments(uidIter.Term);
                    uidIter.Next();
                }
            }
        }
    }

    /// <summary>
    /// Recursively processes every file below <paramref name="dir"/>,
    /// visiting entries in ascending ordinal order.
    /// </summary>
    /// <param name="writer">Writer that receives additions and deletions.</param>
    /// <param name="uidIter">Forward-only uid term iterator, or null for a complete reindex.</param>
    /// <param name="dir">Directory whose entries are processed.</param>
    /// <param name="operation">Which pass to perform.</param>
    private static void IndexDirectory(IndexWriter writer, TermEnum uidIter, DirectoryInfo dir, Operation operation)
    {
        var entries = Directory.GetFileSystemEntries(dir.FullName);

        // Sort the entries. This is important: the uidIter TermEnum is
        // iterated in a forward-only fashion, requiring all files to be
        // passed in ascending order. The sort must be ordinal because
        // IndexFile compares uids with String.CompareOrdinal; the default
        // culture-sensitive sort can order names differently.
        Array.Sort(entries, StringComparer.Ordinal);

        foreach (var path in entries)
        {
            // The entries are already full paths (dir.FullName is absolute),
            // so no Path.Combine with the parent directory is needed.
            if (Directory.Exists(path))
            {
                IndexDirectory(writer, uidIter, new DirectoryInfo(path), operation);
            }
            else if (File.Exists(path))
            {
                IndexFile(writer, uidIter, new FileInfo(path), operation);
            }
        }
    }

    /// <summary>
    /// Processes a single file according to <paramref name="operation"/>.
    /// Files whose name does not end in ".html", ".htm" or ".txt" are ignored.
    /// </summary>
    /// <param name="writer">Writer that receives additions and deletions.</param>
    /// <param name="uidIter">Forward-only uid term iterator; expected to be null only for a complete reindex.</param>
    /// <param name="file">File to process.</param>
    /// <param name="operation">Which pass to perform.</param>
    private static void IndexFile(IndexWriter writer, TermEnum uidIter, FileInfo file, Operation operation)
    {
        if (!HasIndexableExtension(file))
        {
            return;
        }

        // We've found a file we should index.
        if (operation != Operation.IncrementalReindex && operation != Operation.RemoveStale)
        {
            // We're doing a complete reindexing. We aren't using uidIter,
            // but for completeness we assert that it's null (as expected).
            Debug.Assert(uidIter == null, "Expected uidIter == null for operation == " + operation);
            AddDocument(writer, file);
            return;
        }

        // We should only get here with an open uidIter.
        Debug.Assert(uidIter != null, "Expected uidIter != null for operation " + operation);
        var uid = HTMLDocument.Uid(file); // construct uid for doc

        // Advance past every indexed uid that sorts before this file's uid.
        // During stale removal those uids have no matching file, so delete them.
        while (uidIter.Term != null && uidIter.Term.Field == "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) < 0)
        {
            if (operation == Operation.RemoveStale)
            {
                Console.Out.WriteLine("deleting " + HTMLDocument.Uid2url(uidIter.Term.Text));
                writer.DeleteDocuments(uidIter.Term);
            }
            uidIter.Next();
        }

        // The uidIter TermEnum should now be pointing at either
        // 1) a null term, meaning there are no more uids to check,
        // 2) a term matching the current file, or
        // 3) a term not matching the current file.
        if (uidIter.Term != null && uidIter.Term.Field == "uid" && String.CompareOrdinal(uidIter.Term.Text, uid) == 0)
        {
            // uidIter points to the current document; move one step ahead
            // to keep the state consistent, and carry on.
            uidIter.Next();
        }
        else if (operation == Operation.IncrementalReindex)
        {
            // uidIter does not point to the current document, and we're
            // currently indexing documents, so this file is new.
            AddDocument(writer, file);
        }
    }

    /// <summary>
    /// Returns true when the file's full name ends in one of the indexed
    /// extensions (".html", ".htm" or ".txt"), compared ordinally and
    /// case-sensitively.
    /// </summary>
    private static bool HasIndexableExtension(FileInfo file)
    {
        var name = file.FullName;
        return name.EndsWith(".html", StringComparison.Ordinal)
            || name.EndsWith(".htm", StringComparison.Ordinal)
            || name.EndsWith(".txt", StringComparison.Ordinal);
    }

    /// <summary>Builds the document for a file, logs its path and adds it to the index.</summary>
    private static void AddDocument(IndexWriter writer, FileInfo file)
    {
        var doc = HTMLDocument.Document(file);
        Console.Out.WriteLine("adding " + doc.Get("path"));
        writer.AddDocument(doc);
    }

    /// <summary>The kind of pass performed over the directory tree.</summary>
    private enum Operation
    {
        /// <summary>
        /// Indicates an incremental indexing.
        /// </summary>
        IncrementalReindex,

        /// <summary>
        /// Indicates that stale entries in the index should be removed.
        /// </summary>
        RemoveStale,

        /// <summary>
        /// Indicates a complete reindexing.
        /// </summary>
        CompleteReindex
    }
}
}