/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using OutputStream = Lucene.Net.Store.OutputStream;
namespace Lucene.Net.Index
{
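/// <summary>
/// Writes a single Document into a new segment: stores its field names and
/// values, inverts the indexed fields into an in-memory posting table, and
/// writes out the postings, term vectors and norms for that segment.
/// In normal operation this class is driven by IndexWriter.
/// </summary>
/// <remarks>
/// A minimal, illustrative sketch of direct use (the segment name "_0" and the
/// analyzer choice are assumptions for the example, not taken from this file):
/// <code>
/// Directory dir = new Lucene.Net.Store.RAMDirectory();
/// Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
/// DocumentWriter writer = new DocumentWriter(dir, analyzer, Similarity.GetDefault(), 10000);
///
/// Document doc = new Document();
/// doc.Add(Field.Text("contents", "an example document"));
/// writer.AddDocument("_0", doc); // writes _0.fnm, _0.fdx/_0.fdt, postings and norms
/// </code>
/// </remarks>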
public sealed class DocumentWriter
{
private Analyzer analyzer;
private Directory directory;
private Similarity similarity;
private FieldInfos fieldInfos;
private int maxFieldLength;
/// <param name="directory">The directory to write the document information to</param>
/// <param name="analyzer">The analyzer to use for the document</param>
/// <param name="similarity">The Similarity function</param>
/// <param name="maxFieldLength">The maximum number of tokens a Field may have</param>
public /*internal*/ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
{
this.directory = directory;
this.analyzer = analyzer;
this.similarity = similarity;
this.maxFieldLength = maxFieldLength;
}
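/// <summary>
/// Adds a document to the given segment: writes the field names (.fnm) and
/// stored field values, inverts the indexed fields into the posting table,
/// then writes the postings, term vectors and per-field norms.
/// </summary>
/// <param name="segment">The name of the segment to write</param>
/// <param name="doc">The document to add</param>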
/*internal*/ public void AddDocument(System.String segment, Document doc)
{
// write Field names
fieldInfos = new FieldInfos();
fieldInfos.Add(doc);
fieldInfos.Write(directory, segment + ".fnm");
// write Field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
try
{
fieldsWriter.AddDocument(doc);
}
finally
{
fieldsWriter.Close();
}
// invert doc into postingTable
postingTable.Clear(); // discard postings from any previously added document
fieldLengths = new int[fieldInfos.Size()]; // reset per-field counters
fieldPositions = new int[fieldInfos.Size()];
fieldBoosts = new float[fieldInfos.Size()];
float boost = doc.GetBoost();
for (int i = 0; i < fieldBoosts.Length; i++)
{
fieldBoosts[i] = boost;
}
InvertDocument(doc);
// sort postingTable into an array
Posting[] postings = SortPostingTable();
// write postings
WritePostings(postings, segment);
// write norms of indexed fields
WriteNorms(doc, segment);
}
// Keys are Terms, values are Postings.
// Used to buffer a document before it is written to the index.
private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
private int[] fieldLengths; // number of tokens indexed, per field
private int[] fieldPositions; // next token position, per field
private float[] fieldBoosts; // accumulated boost, per field
/// <summary>Tokenizes the fields of a document into Postings.</summary>
private void InvertDocument(Document doc)
{
foreach (Field field in doc.Fields())
{
System.String fieldName = field.Name();
int fieldNumber = fieldInfos.FieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of Field
int position = fieldPositions[fieldNumber]; // position in Field
if (field.IsIndexed())
{
if (!field.IsTokenized())
{
// un-tokenized Field
AddPosition(fieldName, field.StringValue(), position++);
length++;
}
else
{
System.IO.TextReader reader; // find or make Reader
if (field.ReaderValue() != null)
reader = field.ReaderValue();
else if (field.StringValue() != null)
reader = new System.IO.StringReader(field.StringValue());
else
throw new System.ArgumentException("Field must have either String or Reader value");
// Tokenize Field and add to postingTable
TokenStream stream = analyzer.TokenStream(fieldName, reader);
try
{
for (Token t = stream.Next(); t != null; t = stream.Next())
{
position += (t.GetPositionIncrement() - 1); // honor gaps and stacked tokens
AddPosition(fieldName, t.TermText(), position++);
if (++length > maxFieldLength)
break; // truncate the field once maxFieldLength tokens have been indexed
}
}
finally
{
stream.Close();
}
}
fieldLengths[fieldNumber] = length; // save Field length
fieldPositions[fieldNumber] = position; // save Field position
fieldBoosts[fieldNumber] *= field.GetBoost();
}
}
}
private Term termBuffer = new Term("", ""); // reused across lookups to avoid allocating a Term per token
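/// <summary>
/// Records one occurrence of a term at the given position: appends to the
/// existing Posting if the term was already seen in this document, growing
/// its positions array as needed, or creates a new Posting otherwise.
/// </summary>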
private void AddPosition(System.String field, System.String text, int position)
{
termBuffer.Set(field, text);
Posting ti = (Posting) postingTable[termBuffer];
if (ti != null)
{
// word seen before
int freq = ti.freq;
if (ti.positions.Length == freq)
{
// positions array is full
int[] newPositions = new int[freq * 2]; // double the size
System.Array.Copy(ti.positions, 0, newPositions, 0, freq); // copy old positions to new
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
ti.freq = freq + 1; // update frequency
}
else
{
// word not seen before
Term term = new Term(field, text, false);
postingTable[term] = new Posting(term, position);
}
}
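/// <summary>Copies the posting table into an array sorted by term.</summary>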
private Posting[] SortPostingTable()
{
// copy postingTable into an array
Posting[] array = new Posting[postingTable.Count];
postingTable.Values.CopyTo(array, 0);
// sort the array
QuickSort(array, 0, array.Length - 1);
return array;
}
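/// <summary>
/// Sorts postings[lo..hi] in place by term. The pivot is the median of the
/// first, middle and last elements, which avoids quadratic behavior on
/// already-sorted input.
/// </summary>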
private static void QuickSort(Posting[] postings, int lo, int hi)
{
if (lo >= hi)
return;
int mid = lo + ((hi - lo) / 2); // overflow-safe midpoint
if (postings[lo].term.CompareTo(postings[mid].term) > 0)
{
Posting tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if (postings[mid].term.CompareTo(postings[hi].term) > 0)
{
Posting tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (postings[lo].term.CompareTo(postings[mid].term) > 0)
{
Posting tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return;
Term partition = postings[mid].term;
for (;;)
{
while (postings[right].term.CompareTo(partition) > 0)
--right;
while (left < right && postings[left].term.CompareTo(partition) <= 0)
++left;
if (left < right)
{
Posting tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
}
else
{
break;
}
}
QuickSort(postings, lo, left);
QuickSort(postings, left + 1, hi);
}
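/// <summary>
/// Writes the sorted postings for this one-document segment: a dictionary
/// entry per term (pointing into the .frq and .prx files), the in-document
/// frequency in .frq, delta-encoded positions in .prx, and term vectors for
/// fields that store them.
/// </summary>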
private void WritePostings(Posting[] postings, System.String segment)
{
OutputStream freq = null, prox = null;
TermInfosWriter tis = null;
TermVectorsWriter termVectorWriter = null;
try
{
//open files for inverse index storage
freq = directory.CreateFile(segment + ".frq");
prox = directory.CreateFile(segment + ".prx");
tis = new TermInfosWriter(directory, segment, fieldInfos);
TermInfo ti = new TermInfo();
System.String currentField = null;
for (int i = 0; i < postings.Length; i++)
{
Posting posting = postings[i];
// add an entry to the dictionary with pointers to prox and freq files
ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), - 1);
tis.Add(posting.term, ti);
// add an entry to the freq file
int postingFreq = posting.freq;
if (postingFreq == 1)
freq.WriteVInt(1); // doc code 0 with the low bit set: optimized freq == 1 case
else
{
freq.WriteVInt(0); // doc code 0 with the low bit clear: frequency follows
freq.WriteVInt(postingFreq); // frequency in doc
}
int lastPosition = 0; // write positions
int[] positions = posting.positions;
for (int j = 0; j < postingFreq; j++)
{
// use delta-encoding
int position = positions[j];
prox.WriteVInt(position - lastPosition);
lastPosition = position;
}
// check to see if we switched to a new Field
System.String termField = posting.term.Field();
if (currentField != termField)
{
// changing Field - see if there is something to save
currentField = termField;
FieldInfo fi = fieldInfos.FieldInfo(currentField);
if (fi.storeTermVector)
{
if (termVectorWriter == null)
{
termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter.OpenDocument();
}
termVectorWriter.OpenField(currentField);
}
else if (termVectorWriter != null)
{
termVectorWriter.CloseField();
}
}
if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
{
termVectorWriter.AddTerm(posting.term.Text(), postingFreq);
}
}
if (termVectorWriter != null)
termVectorWriter.CloseDocument();
}
finally
{
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
System.IO.IOException keep = null;
if (freq != null)
try
{
freq.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (prox != null)
try
{
prox.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tis != null)
try
{
tis.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (termVectorWriter != null)
try
{
termVectorWriter.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (keep != null)
{
throw keep; // re-throw the first exception encountered while closing
}
}
}
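/// <summary>
/// Writes one norm byte per indexed field, to the file "segment.fN": the
/// field's accumulated boost times the Similarity length norm for the number
/// of tokens indexed, encoded into a single byte.
/// </summary>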
private void WriteNorms(Document doc, System.String segment)
{
for (int n = 0; n < fieldInfos.Size(); n++)
{
FieldInfo fi = fieldInfos.FieldInfo(n);
if (fi.isIndexed)
{
float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
OutputStream norms = directory.CreateFile(segment + ".f" + n);
try
{
norms.WriteByte(Similarity.EncodeNorm(norm)); // encode the float norm into one byte
}
finally
{
norms.Close();
}
}
}
}
}
/// <summary>Info about a Term in a doc: its frequency and the positions at which it occurs.</summary>
sealed class Posting
{
internal Term term; // the Term
internal int freq; // its frequency in doc
internal int[] positions; // positions it occurs at
internal Posting(Term t, int position)
{
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
}
}
}