lucene-2.9.4/ 0000755 0001750 0001750 00000000000 11554106561 013457 5 ustar janpascal janpascal lucene-2.9.4/lib/ 0000755 0001750 0001750 00000000000 11554106562 014226 5 ustar janpascal janpascal lucene-2.9.4/contrib/ 0000755 0001750 0001750 00000000000 11554106561 015117 5 ustar janpascal janpascal lucene-2.9.4/contrib/ant/ 0000755 0001750 0001750 00000000000 11554106561 015701 5 ustar janpascal janpascal lucene-2.9.4/contrib/ant/lib/ 0000755 0001750 0001750 00000000000 11554106562 016450 5 ustar janpascal janpascal lucene-2.9.4/contrib/ant/build.xml 0000644 0001750 0001750 00000002645 11474320237 017530 0 ustar janpascal janpascal
* * The document has three fields: title (the file name, stored and
* analyzed), contents (the full contents of the file, stored and
* analyzed) and rawcontents (the same contents, stored but not
* indexed);
*
*@param f the file whose name and contents populate the document
*@return the Lucene document built from the file
*@exception IOException if creating the TextDocument for the file fails
*/
public static Document Document(File f) throws IOException {
    // Load the file once; the same text feeds both content fields.
    final String text = new TextDocument(f).getContents();

    final Document result = new Document();
    // File name, stored and analyzed.
    result.add(new Field("title", f.getName(), Field.Store.YES, Field.Index.ANALYZED));
    // Searchable copy of the text.
    result.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
    // Stored-only copy of the text (not indexed).
    result.add(new Field("rawcontents", text, Field.Store.YES, Field.Index.NO));
    return result;
}
/**
 * Returns the value currently held in the <code>contents</code> field.
 *
 *@return The contents value
 */
public String getContents() {
    return this.contents;
}
}
lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/DocumentHandlerException.java 0000644 0001750 0001750 00000004164 11474320237 031260 0 ustar janpascal janpascal package org.apache.lucene.ant;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.PrintStream;
import java.io.PrintWriter;
/**
 * Exception that can optionally wrap another exception.
 *
 * The wrapped exception is kept in this class's own field and is exposed
 * through {@link #getException()}; the stack trace printers append it
 * after this exception's own trace under a
 * <code>--- Nested Exception ---</code> marker.
 */
public class DocumentHandlerException extends Exception {
    /** The wrapped exception, or <code>null</code> when there is none. */
    private Throwable cause;

    /**
     * Default constructor.
     */
    public DocumentHandlerException() {
        super();
    }

    /**
     * Constructs with message.
     *
     * @param message the detail message
     */
    public DocumentHandlerException(String message) {
        super(message);
    }

    /**
     * Constructs with chained exception. The detail message is the
     * string form of the nested exception.
     *
     * @param cause the exception to wrap; may be <code>null</code>, in
     *        which case the detail message is <code>null</code> as well
     */
    public DocumentHandlerException(Throwable cause) {
        // Guard against null: calling toString() on it would throw a
        // NullPointerException while constructing the exception itself.
        super(cause == null ? null : cause.toString());
        this.cause = cause;
    }

    /**
     * Retrieves nested exception.
     *
     * @return the wrapped exception, or <code>null</code> if none was given
     */
    public Throwable getException() {
        return cause;
    }

    /**
     * Prints this exception and any nested exception to
     * <code>System.err</code>.
     */
    public void printStackTrace() {
        printStackTrace(System.err);
    }

    /**
     * Prints this exception's stack trace followed, when present, by the
     * nested exception's stack trace.
     *
     * @param ps the stream to print to
     */
    public void printStackTrace(PrintStream ps) {
        // Hold the stream's monitor so both traces are written together.
        synchronized (ps) {
            super.printStackTrace(ps);
            if (cause != null) {
                ps.println("--- Nested Exception ---");
                cause.printStackTrace(ps);
            }
        }
    }

    /**
     * Prints this exception's stack trace followed, when present, by the
     * nested exception's stack trace.
     *
     * @param pw the writer to print to
     */
    public void printStackTrace(PrintWriter pw) {
        // Hold the writer's monitor so both traces are written together.
        synchronized (pw) {
            super.printStackTrace(pw);
            if (cause != null) {
                pw.println("--- Nested Exception ---");
                cause.printStackTrace(pw);
            }
        }
    }
}
lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java 0000644 0001750 0001750 00000022334 11474320237 026727 0 ustar janpascal janpascal package org.apache.lucene.ant;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Field;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
/**
 * The <code>HtmlDocument</code> class creates a Lucene {@link
 * org.apache.lucene.document.Document} from an HTML document.
 *
 * <p>It does this by using the JTidy package. It can take input
 * from a {@link java.io.File} or an {@link java.io.InputStream}.
 */
public class HtmlDocument {
    /** Root element of the parsed HTML; the accessors tolerate <code>null</code>. */
    private Element rawDoc;

    //-------------------------------------------------------------
    // Constructors
    //-------------------------------------------------------------

    /**
     * Constructs an <code>HtmlDocument</code> from a {@link
     * java.io.File}.
     *
     *@param file the <code>File</code> containing the
     *      HTML to parse
     *@exception IOException if an I/O exception occurs
     */
    public HtmlDocument(File file) throws IOException {
        rawDoc = parseFile(silence(new Tidy()), file);
    }

    /**
     * Constructs an <code>HtmlDocument</code> from an {@link
     * java.io.InputStream}. The stream is not closed by this
     * constructor.
     *
     *@param is the <code>InputStream</code>
     *      containing the HTML
     */
    public HtmlDocument(InputStream is) {
        org.w3c.dom.Document root = silence(new Tidy()).parseDOM(is, null);
        rawDoc = root.getDocumentElement();
    }

    /**
     * Constructs an <code>HtmlDocument</code> from a
     * {@link java.io.File}, using a Tidy configuration file.
     *
     * @param file the <code>File</code> containing the
     *      HTML to parse
     * @param tidyConfigFile the <code>String</code>
     *      containing the full path to the Tidy config file
     * @exception IOException if an I/O exception occurs
     */
    public HtmlDocument(File file, String tidyConfigFile) throws IOException {
        Tidy tidy = new Tidy();
        // Load the configuration first so that the quiet/no-warnings
        // settings applied afterwards take precedence over it.
        tidy.setConfigurationFromFile(tidyConfigFile);
        // parseFile closes the input stream; it was previously leaked here.
        rawDoc = parseFile(silence(tidy), file);
    }

    //-------------------------------------------------------------
    // Factory methods
    //-------------------------------------------------------------

    /**
     * Creates a Lucene <code>Document</code> from a
     * {@link java.io.File}. The document carries the fields
     * <code>title</code>, <code>contents</code> and
     * <code>rawcontents</code>.
     *
     * @param file the HTML file to index
     * @param tidyConfigFile the full path to the Tidy
     *      config file
     * @return the Lucene document built from the file
     * @exception IOException if an I/O exception occurs
     */
    public static org.apache.lucene.document.Document
            Document(File file, String tidyConfigFile) throws IOException {
        return withRawContents(
            titleAndContents(new HtmlDocument(file, tidyConfigFile)), file);
    }

    /**
     * Creates a Lucene <code>Document</code> from an {@link
     * java.io.InputStream}. The document carries the fields
     * <code>title</code> and <code>contents</code> only.
     *
     *@param is the stream containing the HTML
     *@return the Lucene document built from the stream
     */
    public static org.apache.lucene.document.Document
            getDocument(InputStream is) {
        return titleAndContents(new HtmlDocument(is));
    }

    /**
     * Creates a Lucene <code>Document</code> from a {@link
     * java.io.File}. The document carries the fields
     * <code>title</code>, <code>contents</code> and
     * <code>rawcontents</code>.
     *
     *@param file the HTML file to index
     *@return the Lucene document built from the file
     *@exception IOException if an I/O exception occurs
     */
    public static org.apache.lucene.document.Document
            Document(File file) throws IOException {
        return withRawContents(titleAndContents(new HtmlDocument(file)), file);
    }

    /**
     * Runs <code>HtmlDocument</code> on the file named by the first
     * command line argument and prints its title and body.
     *
     *@param args Command line arguments
     *@exception Exception if reading or parsing the file fails
     */
    public static void main(String args[]) throws Exception {
        InputStream in = new FileInputStream(new File(args[0]));
        try {
            HtmlDocument doc = new HtmlDocument(in);
            System.out.println("Title = " + doc.getTitle());
            System.out.println("Body = " + doc.getBody());
        } finally {
            in.close();
        }
    }

    //-------------------------------------------------------------
    // Public methods
    //-------------------------------------------------------------

    /**
     * Gets the title attribute of the <code>HtmlDocument</code>
     * object.
     *
     *@return the text of the first <code>title</code> element; an
     *      empty string if there is no such element or its first
     *      child is not a text node; <code>null</code> if no root
     *      element is available
     */
    public String getTitle() {
        if (rawDoc == null) {
            return null;
        }

        String title = "";

        NodeList nl = rawDoc.getElementsByTagName("title");
        if (nl.getLength() > 0) {
            Node first = nl.item(0).getFirstChild();
            // Only a text node carries the title; any other node type
            // (or no child at all) leaves the title empty instead of
            // failing with a ClassCastException.
            if (first instanceof Text) {
                title = ((Text) first).getData();
            }
        }
        return title;
    }

    /**
     * Gets the bodyText attribute of the
     * <code>HtmlDocument</code> object.
     *
     *@return the concatenated text below the first <code>body</code>
     *      element; an empty string if there is no such element;
     *      <code>null</code> if no root element is available
     */
    public String getBody() {
        if (rawDoc == null) {
            return null;
        }

        String body = "";
        NodeList nl = rawDoc.getElementsByTagName("body");
        if (nl.getLength() > 0) {
            body = getBodyText(nl.item(0));
        }
        return body;
    }

    //-------------------------------------------------------------
    // Private methods
    //-------------------------------------------------------------

    /**
     * Collects the text below a node. Text children are appended as
     * they are; element children are processed recursively and
     * followed by a single space; other node types are skipped.
     *
     *@param node a DOM Node
     *@return The bodyText value
     */
    private String getBodyText(Node node) {
        NodeList nl = node.getChildNodes();
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < nl.getLength(); i++) {
            Node child = nl.item(i);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    buffer.append(getBodyText(child));
                    buffer.append(" ");
                    break;
                case Node.TEXT_NODE:
                    buffer.append(((Text) child).getData());
                    break;
            }
        }
        return buffer.toString();
    }

    /** Switches off Tidy's console output and returns the same instance. */
    private static Tidy silence(Tidy tidy) {
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        return tidy;
    }

    /** Parses the file with the given Tidy instance, always closing the input stream. */
    private static Element parseFile(Tidy tidy, File file) throws IOException {
        InputStream is = new FileInputStream(file);
        try {
            return tidy.parseDOM(is, null).getDocumentElement();
        } finally {
            is.close();
        }
    }

    /** Builds a Lucene document holding the stored, analyzed title and contents fields. */
    private static org.apache.lucene.document.Document
            titleAndContents(HtmlDocument htmlDoc) {
        org.apache.lucene.document.Document luceneDoc =
            new org.apache.lucene.document.Document();
        luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));
        return luceneDoc;
    }

    /** Adds the stored, unindexed rawcontents field read from the file and returns the document. */
    private static org.apache.lucene.document.Document
            withRawContents(org.apache.lucene.document.Document luceneDoc, File file)
            throws IOException {
        luceneDoc.add(new Field("rawcontents", readLinesJoined(file), Field.Store.YES, Field.Index.NO));
        return luceneDoc;
    }

    /**
     * Reads the file line by line and joins the lines with nothing in
     * between (line terminators are dropped); the reader is always closed.
     */
    private static String readLinesJoined(File file) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            StringBuffer joined = new StringBuffer();
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                joined.append(line);
            }
            return joined.toString();
        } finally {
            br.close();
        }
    }
}
lucene-2.9.4/contrib/ant/src/java/overview.html 0000644 0001750 0001750 00000001661 11474320237 022150 0 ustar janpascal janpascal