boilerpipe-1.2.0/src/ 40755 0 0 0 11502365053 11465 5ustar 0 0 boilerpipe-1.2.0/src/demo/ 40755 0 0 0 11502365053 12411 5ustar 0 0 boilerpipe-1.2.0/src/demo/de/ 40755 0 0 0 11502365053 13001 5ustar 0 0 boilerpipe-1.2.0/src/demo/de/l3s/ 40755 0 0 0 11502365053 13502 5ustar 0 0 boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/ 40755 0 0 0 11502365053 15634 5ustar 0 0 boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/demo/ 40755 0 0 0 11502365053 16560 5ustar 0 0 boilerpipe-1.2.0/src/main/ 40755 0 0 0 11502365056 12414 5ustar 0 0 boilerpipe-1.2.0/src/main/de/ 40755 0 0 0 11502365053 13001 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/ 40755 0 0 0 11502365053 13502 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/ 40755 0 0 0 11502365056 15637 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/boilers/ 40755 0 0 0 11605072057 17276 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/conditions/ 40755 0 0 0 11502365054 20006 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/document/ 40755 0 0 0 11502365054 17453 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/estimators/ 40755 0 0 0 11502365054 20027 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/ 40755 0 0 0 11605072074 20034 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/ 40755 0 0 0 11502365055 17306 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/ 40755 0 0 0 11605070255 20736 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/ 40755 0 0 0 11502365055 21470 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/ 40755 0 0 0 11502365055 20577 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/labels/ 40755 0 0 0 11502365055 17100 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/ 40755 0 0 0 11502365056 16432 5ustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/util/ 40755 0 0 0 11502365056 16614 5ustar 0 0 boilerpipe-1.2.0/src/main/org/ 40755 0 0 0 11502365056 13203 5ustar 0 0 
boilerpipe-1.2.0/src/main/org/cyberneko/ 40755 0 0 0 11502365056 15164 5ustar 0 0 boilerpipe-1.2.0/src/main/org/cyberneko/html/ 40755 0 0 0 11502365056 16130 5ustar 0 0 boilerpipe-1.2.0/.classpath100644 0 0 743 11502365045 12743 0ustar 0 0 boilerpipe-1.2.0/.project100644 0 0 566 11502365045 12432 0ustar 0 0 boilerpipe-core org.eclipse.jdt.core.javabuilder org.eclipse.jdt.core.javanature boilerpipe-1.2.0/LICENSE.txt100644 0 0 1164 11570206520 12616 0ustar 0 0 boilerpipe Copyright (c) 2009-2011 Christian Kohlschütter The author licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. boilerpipe-1.2.0/NOTICE.txt100644 0 0 1443 11570206530 12516 0ustar 0 0 boilerpipe Copyright (c) 2009-2011 Christian Kohlschütter The author licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
This software contains the following parts which are also provided under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt): - NekoHTML - Xerces boilerpipe-1.2.0/build.xml100644 0 0 15172 11570206451 12643 0ustar 0 0 ${app.name} ${app.title} ${app.version} yes no Christian Kohlschütter boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/demo/HTMLHighlightDemo.java100644 0 0 3114 11570206250 22716 0ustar 0 0 package de.l3s.boilerpipe.demo; import java.io.PrintWriter; import java.net.URL; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.extractors.CommonExtractors; import de.l3s.boilerpipe.sax.HTMLHighlighter; /** * Demonstrates how to use Boilerpipe to get the main content, highlighted as HTML. * * @author Christian Kohlschütter * @see Oneliner if you only need the plain text. */ public class HTMLHighlightDemo { public static void main(String[] args) throws Exception { URL url = new URL( "http://research.microsoft.com/en-us/um/people/ryenw/hcir2010/challenge.html" // "http://boilerpipe-web.appspot.com/" ); // choose from a set of useful BoilerpipeExtractors... 
final BoilerpipeExtractor extractor = CommonExtractors.ARTICLE_EXTRACTOR; // final BoilerpipeExtractor extractor = CommonExtractors.DEFAULT_EXTRACTOR; // final BoilerpipeExtractor extractor = CommonExtractors.CANOLA_EXTRACTOR; // final BoilerpipeExtractor extractor = CommonExtractors.LARGEST_CONTENT_EXTRACTOR; // choose the operation mode (i.e., highlighting or extraction) final HTMLHighlighter hh = HTMLHighlighter.newHighlightingInstance(); // final HTMLHighlighter hh = HTMLHighlighter.newExtractingInstance(); PrintWriter out = new PrintWriter("/tmp/highlighted.html", "UTF-8"); out.println(""); out.println(""); out.println(hh.process(url, extractor)); out.close(); System.out.println("Now open file:///tmp/highlighted.html in your web browser"); } } boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/demo/Oneliner.java100644 0 0 3241 11506127045 21274 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.demo; import java.net.URL; import de.l3s.boilerpipe.extractors.ArticleExtractor; /** * Demonstrates how to use Boilerpipe to get the main content as plain text. * Note: In real-world cases, you'd probably want to download the file first using a fault-tolerant crawler. * * @author Christian Kohlschütter * @see HTMLHighlightDemo if you need HTML as well. 
*/ public class Oneliner { public static void main(final String[] args) throws Exception { final URL url = new URL( // "http://www.l3s.de/web/page11g.do?sp=page11g&link=ln104g&stu1g.LanguageISOCtxParam=en" "http://www.dn.se/nyheter/vetenskap/annu-godare-choklad-med-hjalp-av-dna-teknik" ); // This can also be done in one line: System.out.println(ArticleExtractor.INSTANCE.getText(url)); // Also try other extractors! // System.out.println(DefaultExtractor.INSTANCE.getText(url)); // System.out.println(CommonExtractors.CANOLA_EXTRACTOR.getText(url)); } } boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/demo/UsingSAX.java100644 0 0 3261 11502365053 21163 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.demo; import java.net.URL; import org.xml.sax.InputSource; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.extractors.ArticleExtractor; import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; import de.l3s.boilerpipe.sax.HTMLFetcher; /** * Demonstrates how to use Boilerpipe when working with {@link InputSource}s. 
* * @author Christian Kohlschütter */ public class UsingSAX { public static void main(final String[] args) throws Exception { URL url; url = new URL( "http://www.l3s.de/web/page11g.do?sp=page11g&link=ln104g&stu1g.LanguageISOCtxParam=en"); final InputSource is = HTMLFetcher.fetch(url).toInputSource(); final BoilerpipeSAXInput in = new BoilerpipeSAXInput(is); final TextDocument doc = in.getTextDocument(); // You have the choice between different Extractors // System.out.println(DefaultExtractor.INSTANCE.getText(doc)); System.out.println(ArticleExtractor.INSTANCE.getText(doc)); } } boilerpipe-1.2.0/src/demo/de/l3s/boilerpipe/demo/package.html100644 0 0 135 11502365053 21115 0ustar 0 0

A few small demo programs (contained in boilerpipe-demo.jar).

/**
 * Something that can be represented as a {@link TextDocument}.
 */
public interface BoilerpipeDocumentSource {
    /**
     * Returns this source as a {@link TextDocument}.
     *
     * @return the {@link TextDocument} representation of this source
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failure while
     *             building the document
     */
    TextDocument toTextDocument() throws BoilerpipeProcessingException;
}
* @throws BoilerpipeProcessingException */ public String getText(TextDocument doc) throws BoilerpipeProcessingException; } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/BoilerpipeFilter.java100644 0 0 2453 11502365054 22041 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe; import de.l3s.boilerpipe.document.TextDocument; /** * A generic {@link BoilerpipeFilter}. Takes a {@link TextDocument} and * processes it somehow. * * @author Christian Kohlschütter */ public interface BoilerpipeFilter { /** * Processes the given document doc. * * @param doc * The {@link TextDocument} that is to be processed. * @return true if changes have been made to the * {@link TextDocument}. * @throws BoilerpipeProcessingException */ boolean process(final TextDocument doc) throws BoilerpipeProcessingException; } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/BoilerpipeInput.java100644 0 0 2112 11502365054 21703 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
/**
 * A source that returns {@link TextDocument}s.
 *
 * @author Christian Kohlschütter
 */
public interface BoilerpipeInput {
    /**
     * Returns (somehow) a {@link TextDocument}.
     *
     * @return A {@link TextDocument}.
     * @throws BoilerpipeProcessingException
     *             declared so that implementations can report a failure while
     *             producing the document
     */
    TextDocument getTextDocument() throws BoilerpipeProcessingException;
}
/**
 * Checked exception used to signal a failure in the processing pipeline.
 *
 * @author Christian Kohlschütter
 */
public class BoilerpipeProcessingException extends Exception {
    private static final long serialVersionUID = 1L;

    /** Creates an exception with neither a detail message nor a cause. */
    public BoilerpipeProcessingException() {
    }

    /**
     * Creates an exception with the given detail message.
     *
     * @param message the detail message
     */
    public BoilerpipeProcessingException(String message) {
        super(message);
    }

    /**
     * Creates an exception wrapping the given cause.
     *
     * @param cause the underlying cause
     */
    public BoilerpipeProcessingException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with the given detail message and cause.
     *
     * @param message the detail message
     * @param cause the underlying cause
     */
    public BoilerpipeProcessingException(String message, Throwable cause) {
        super(message, cause);
    }
}
*/ boolean meetsCondition(final TextBlock tb); } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/document/TextBlock.java100644 0 0 20047 11502365054 22335 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.document; import java.util.BitSet; import java.util.HashSet; import java.util.Set; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Describes a block of text. * * A block can be an "atomic" text element (i.e., a sequence of text that is not * interrupted by any HTML markup) or a compound of such atomic elements. 
* * @author Christian Kohlschütter */ public class TextBlock implements Cloneable { boolean isContent = false; private CharSequence text; Set labels = null; int offsetBlocksStart; int offsetBlocksEnd; int numWords; int numWordsInAnchorText; int numWordsInWrappedLines; int numWrappedLines; float textDensity; float linkDensity; BitSet containedTextElements; private int numFullTextWords = 0; private int tagLevel; private static final BitSet EMPTY_BITSET = new BitSet(); public static final TextBlock EMPTY_START = new TextBlock("", EMPTY_BITSET, 0, 0, 0, 0, -1); public static final TextBlock EMPTY_END = new TextBlock("", EMPTY_BITSET, 0, 0, 0, 0, Integer.MAX_VALUE); public TextBlock(final String text) { this(text, null, 0,0,0,0,0); } public TextBlock(final String text, final BitSet containedTextElements, final int numWords, final int numWordsInAnchorText, final int numWordsInWrappedLines, final int numWrappedLines, final int offsetBlocks) { this.text = text; this.containedTextElements = containedTextElements; this.numWords = numWords; this.numWordsInAnchorText = numWordsInAnchorText; this.numWordsInWrappedLines = numWordsInWrappedLines; this.numWrappedLines = numWrappedLines; this.offsetBlocksStart = offsetBlocks; this.offsetBlocksEnd = offsetBlocks; initDensities(); } public boolean isContent() { return isContent; } public boolean setIsContent(boolean isContent) { if (isContent != this.isContent) { this.isContent = isContent; return true; } else { return false; } } public String getText() { return text.toString(); } public int getNumWords() { return numWords; } public int getNumWordsInAnchorText() { return numWordsInAnchorText; } public float getTextDensity() { return textDensity; } public float getLinkDensity() { return linkDensity; } public void mergeNext(final TextBlock other) { if (!(text instanceof StringBuilder)) { text = new StringBuilder(text); } StringBuilder sb = (StringBuilder) text; sb.append('\n'); sb.append(other.text); numWords += other.numWords; 
numWordsInAnchorText += other.numWordsInAnchorText; numWordsInWrappedLines += other.numWordsInWrappedLines; numWrappedLines += other.numWrappedLines; offsetBlocksStart = Math .min(offsetBlocksStart, other.offsetBlocksStart); offsetBlocksEnd = Math.max(offsetBlocksEnd, other.offsetBlocksEnd); initDensities(); this.isContent |= other.isContent; if(containedTextElements == null) { containedTextElements = (BitSet)other.containedTextElements.clone(); } else { containedTextElements.or(other.containedTextElements); } numFullTextWords += other.numFullTextWords; if (other.labels != null) { if (labels == null) { labels = new HashSet(other.labels); } else { labels.addAll(other.labels); } } tagLevel = Math.min(tagLevel, other.tagLevel); } private void initDensities() { if (numWordsInWrappedLines == 0) { numWordsInWrappedLines = numWords; numWrappedLines = 1; } textDensity = numWordsInWrappedLines / (float) numWrappedLines; linkDensity = numWords == 0 ? 0 : numWordsInAnchorText / (float) numWords; } public int getOffsetBlocksStart() { return offsetBlocksStart; } public int getOffsetBlocksEnd() { return offsetBlocksEnd; } public String toString() { return "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl="+tagLevel+"; nw="+numWords+";nwl="+numWrappedLines+";ld="+linkDensity+"]\t" + (isContent?"CONTENT":"boilerplate") + "," + labels + "\n" + getText(); } /** * Adds an arbitrary String label to this {@link TextBlock}. * * @param label The label * @see DefaultLabels */ public void addLabel(final String label) { if (labels == null) { labels = new HashSet(2); } labels.add(label); } /** * Checks whether this TextBlock has the given label. * * @param label The label * @return true if this block is marked by the given label. 
*/ public boolean hasLabel(final String label) { return labels != null && labels.contains(label); } public boolean removeLabel(final String label) { return labels != null && labels.remove(label); } /** * Returns the labels associated to this TextBlock, or null if no such labels * exist. * * NOTE: The returned instance is the one used directly in TextBlock. You have full access * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} * whenever possible. * * @return Returns the set of labels, or null if no labels was added yet. */ public Set getLabels() { return labels; } /** * Adds a set of labels to this {@link TextBlock}. * null-references are silently ignored. * * @param l The labels to be added. */ public void addLabels(final Set l) { if(l == null) { return; } if(this.labels == null) { this.labels = new HashSet(l); } else { this.labels.addAll(l); } } /** * Adds a set of labels to this {@link TextBlock}. * null-references are silently ignored. * * @param l The labels to be added. */ public void addLabels(final String... l) { if(l == null) { return; } if(this.labels == null) { this.labels = new HashSet(); } for(final String label : l) { this.labels.add(label); } } /** * Returns the containedTextElements BitSet, or null. 
* @return */ public BitSet getContainedTextElements() { return containedTextElements; } @Override protected Object clone() { final TextBlock clone; try { clone = (TextBlock)super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } if(text != null && !(text instanceof String)) { clone.text = new StringBuilder(text); } if(labels != null && !labels.isEmpty()) { clone.labels = new HashSet(labels); } if(containedTextElements != null) { clone.containedTextElements = (BitSet)containedTextElements.clone(); } return clone; } public int getTagLevel() { return tagLevel; } public void setTagLevel(int tagLevel) { this.tagLevel = tagLevel; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/document/TextDocument.java100644 0 0 7173 11502365054 23046 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.document; import java.util.List; /** * A text document, consisting of one or more {@link TextBlock}s. * * @author Christian Kohlschütter */ public class TextDocument { final List textBlocks; String title; /** * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no * title. * * @param textBlocks * The text blocks of this document. */ public TextDocument(final List textBlocks) { this(null, textBlocks); } /** * Creates a new {@link TextDocument} with given {@link TextBlock}s and * given title. 
* * @param title * The "main" title for this text document. * @param textBlocks * The text blocks of this document. */ public TextDocument(final String title, final List textBlocks) { this.title = title; this.textBlocks = textBlocks; } /** * Returns the {@link TextBlock}s of this document. * * @return A list of {@link TextBlock}s, in sequential order of appearance. */ public List getTextBlocks() { return textBlocks; } /** * Returns the "main" title for this document, or null if no * such title has ben set. * * @return The "main" title. */ public String getTitle() { return title; } /** * Updates the "main" title for this document. * * @param title */ public void setTitle(final String title) { this.title = title; } /** * Returns the {@link TextDocument}'s content. * * @return The content text. */ public String getContent() { return getText(true, false); } /** * Returns the {@link TextDocument}'s content, non-content or both * * @param includeContent Whether to include TextBlocks marked as "content". * @param includeNonContent Whether to include TextBlocks marked as "non-content". * @return The text. */ public String getText(boolean includeContent, boolean includeNonContent) { StringBuilder sb = new StringBuilder(); LOOP: for (TextBlock block : getTextBlocks()) { if(block.isContent()) { if(!includeContent) { continue LOOP; } } else { if(!includeNonContent) { continue LOOP; } } sb.append(block.getText()); sb.append('\n'); } return sb.toString(); } /** * Returns detailed debugging information about the contained {@link TextBlock}s. * * @return Debug information. 
*/ public String debugString() { StringBuilder sb = new StringBuilder(); for(TextBlock tb : getTextBlocks()) { sb.append(tb.toString()); sb.append('\n'); } return sb.toString(); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/document/TextDocumentStatistics.java100644 0 0 3425 11502365054 25115 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.document; /** * Provides shallow statistics on a given TextDocument * * @author Christian Kohlschuetter */ public final class TextDocumentStatistics { private int numWords = 0; private int numBlocks = 0; /** * Computes statistics on a given {@link TextDocument}. * * @param doc The {@link TextDocument}. * @param contentOnly if true then o */ public TextDocumentStatistics(final TextDocument doc, final boolean contentOnly) { for (TextBlock tb : doc.getTextBlocks()) { if (contentOnly && !tb.isContent()) { continue; } numWords += tb.getNumWords(); numBlocks++; } } /** * Returns the average number of words at block-level (= overall number of words divided by * the number of blocks). * * @return Average */ public float avgNumWords() { return numWords / (float) numBlocks; } /** * Returns the overall number of words in all blocks. * * @return Sum */ public int getNumWords() { return numWords; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/document/package.html100644 0 0 161 11502365054 22007 0ustar 0 0

The classes in this package represent the simple Boilerpipe document model.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/estimators/SimpleEstimator.java100644 0 0 3676 11502365054 24124 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.estimators; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.document.TextDocumentStatistics; import de.l3s.boilerpipe.extractors.ArticleExtractor; import de.l3s.boilerpipe.extractors.DefaultExtractor; /** * Estimates the "goodness" of a {@link BoilerpipeExtractor} on a given document. * * @author Christian Kohlschütter */ public final class SimpleEstimator { /** * Returns the singleton instance of {@link SimpleEstimator} */ public static final SimpleEstimator INSTANCE = new SimpleEstimator(); private SimpleEstimator() { } /** * Given the statistics of the document before and after applying the {@link BoilerpipeExtractor}, * can we regard the extraction quality (too) low? * * Works well with {@link DefaultExtractor}, {@link ArticleExtractor} and others. * * @param dsBefore * @param dsAfter * @return true if low quality is to be expected. 
*/ public boolean isLowQuality(final TextDocumentStatistics dsBefore, final TextDocumentStatistics dsAfter) { if (dsBefore.getNumWords() < 90 || dsAfter.getNumWords() < 70) { return true; } if (dsAfter.avgNumWords() < 25) { return true; } return false; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/ArticleExtractor.java100644 0 0 5026 11502365054 24255 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.english.IgnoreBlocksAfterContentFilter; import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier; import de.l3s.boilerpipe.filters.english.TerminatingBlocksFinder; import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; import de.l3s.boilerpipe.filters.heuristics.DocumentTitleMatchClassifier; import de.l3s.boilerpipe.filters.heuristics.ExpandTitleToContentFilter; import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter; import de.l3s.boilerpipe.filters.simple.BoilerplateBlockFilter; /** * A full-text extractor which is tuned towards news articles. In this scenario * it achieves higher accuracy than {@link DefaultExtractor}. 
* * @author Christian Kohlschütter */ public final class ArticleExtractor extends ExtractorBase { public static final ArticleExtractor INSTANCE = new ArticleExtractor(); /** * Returns the singleton instance for {@link ArticleExtractor}. */ public static ArticleExtractor getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return TerminatingBlocksFinder.INSTANCE.process(doc) | new DocumentTitleMatchClassifier(doc.getTitle()).process(doc) | NumWordsRulesClassifier.INSTANCE.process(doc) | IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc) | BlockProximityFusion.MAX_DISTANCE_1.process(doc) | BoilerplateBlockFilter.INSTANCE.process(doc) | BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY.process(doc) | KeepLargestBlockFilter.INSTANCE.process(doc) | ExpandTitleToContentFilter.INSTANCE.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/ArticleSentencesExtractor.java100644 0 0 3262 11502365054 26125 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.simple.MinClauseWordsFilter; import de.l3s.boilerpipe.filters.simple.SplitParagraphBlocksFilter; /** * A full-text extractor which is tuned towards extracting sentences from news articles. * * @author Christian Kohlschütter */ public final class ArticleSentencesExtractor extends ExtractorBase { public static final ArticleSentencesExtractor INSTANCE = new ArticleSentencesExtractor(); /** * Returns the singleton instance for {@link ArticleSentencesExtractor}. */ public static ArticleSentencesExtractor getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return ArticleExtractor.INSTANCE.process(doc) | SplitParagraphBlocksFilter.INSTANCE.process(doc) | MinClauseWordsFilter.INSTANCE.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/CanolaExtractor.java100644 0 0 6267 11502365054 24077 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.extractors; import java.util.List; import java.util.ListIterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.estimators.SimpleEstimator; /** * A full-text extractor trained on krdwrd Canola * . Works well with {@link SimpleEstimator}, too. * * @author Christian Kohlschütter */ public class CanolaExtractor extends ExtractorBase { public static final CanolaExtractor INSTANCE = new CanolaExtractor(); /** * Returns the singleton instance for {@link CanolaExtractor}. */ public static CanolaExtractor getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return CLASSIFIER.process(doc); } /** * The actual classifier, exposed. */ public static final BoilerpipeFilter CLASSIFIER = new BoilerpipeFilter() { public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); boolean hasChanges = false; ListIterator it = textBlocks.listIterator(); if (!it.hasNext()) { return false; } TextBlock prevBlock = TextBlock.EMPTY_START; TextBlock currentBlock = it.next(); TextBlock nextBlock = it.hasNext() ? 
it.next() : TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; if (nextBlock != TextBlock.EMPTY_START) { while (it.hasNext()) { prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = it.next(); hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } return hasChanges; } protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) { final boolean isContent = (curr.getLinkDensity() > 0 && next .getNumWords() > 11) || (curr.getNumWords() > 19 || (next.getNumWords() > 6 && next.getLinkDensity() == 0 && prev.getLinkDensity() == 0 && (curr .getNumWords() > 6 || prev.getNumWords() > 7 || next .getNumWords() > 19))); return curr.setIsContent(isContent); } }; } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/CommonExtractors.java100644 0 0 2370 11502365054 24304 0ustar 0 0 package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeExtractor; /** * Provides quick access to common {@link BoilerpipeExtractor}s. * * @author Christian Kohlschütter */ public final class CommonExtractors { private CommonExtractors() { } /** * Works very well for most types of Article-like HTML. */ public static final ArticleExtractor ARTICLE_EXTRACTOR = ArticleExtractor.INSTANCE; /** * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. */ public static final DefaultExtractor DEFAULT_EXTRACTOR = DefaultExtractor.INSTANCE; /** * Like {@link DefaultExtractor}, but keeps the largest text block only. */ public static final LargestContentExtractor LARGEST_CONTENT_EXTRACTOR = LargestContentExtractor.INSTANCE; /** * Trained on krdwrd Canola (different definition of "boilerplate"). You may * give it a try. 
*/ public static final CanolaExtractor CANOLA_EXTRACTOR = CanolaExtractor.INSTANCE; /** * Dummy Extractor; should return the input text. Use this to double-check * that your problem is within a particular {@link BoilerpipeExtractor}, or * somewhere else. */ public static final KeepEverythingExtractor KEEP_EVERYTHING_EXTRACTOR = KeepEverythingExtractor.INSTANCE; } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/DefaultExtractor.java100644 0 0 3243 11502365054 24255 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.english.DensityRulesClassifier; import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; /** * A quite generic full-text extractor. * * @author Christian Kohlschütter */ public class DefaultExtractor extends ExtractorBase { public static final DefaultExtractor INSTANCE = new DefaultExtractor(); /** * Returns the singleton instance for {@link DefaultExtractor}. 
*/ public static DefaultExtractor getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return SimpleBlockFusionProcessor.INSTANCE.process(doc) | BlockProximityFusion.MAX_DISTANCE_1.process(doc) | DensityRulesClassifier.INSTANCE.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/ExtractorBase.java100644 0 0 7444 11502365054 23552 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.net.URL; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; import de.l3s.boilerpipe.sax.HTMLFetcher; /** * The base class of Extractors. Also provides some helper methods to quickly * retrieve the text that remained after processing. * * @author Christian Kohlschütter */ public abstract class ExtractorBase implements BoilerpipeExtractor { /** * Extracts text from the HTML code given as a String. * * @param html The HTML code as a String. * @return The extracted text. 
* @throws BoilerpipeProcessingException */ public String getText(final String html) throws BoilerpipeProcessingException { try { return getText(new BoilerpipeSAXInput(new InputSource( new StringReader(html))).getTextDocument()); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link InputSource}. * * @param is The InputSource containing the HTML * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final InputSource is) throws BoilerpipeProcessingException { try { return getText(new BoilerpipeSAXInput(is).getTextDocument()); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link URL}. * NOTE: This method is mainly to be used for show case purposes. If you are * going to crawl the Web, consider using {@link #getText(InputSource)} * instead. * * @param url The URL pointing to the HTML code. * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final URL url) throws BoilerpipeProcessingException { try { return getText(HTMLFetcher.fetch(url).toInputSource()); } catch (IOException e) { throw new BoilerpipeProcessingException(e); } } /** * Extracts text from the HTML code available from the given {@link Reader}. * * @param r The Reader containing the HTML * @return The extracted text. * @throws BoilerpipeProcessingException */ public String getText(final Reader r) throws BoilerpipeProcessingException { return getText(new InputSource(r)); } /** * Extracts text from the given {@link TextDocument} object. * * @param doc The {@link TextDocument}. * @return The extracted text. 
* @throws BoilerpipeProcessingException */ public String getText(TextDocument doc) throws BoilerpipeProcessingException { process(doc); return doc.getContent(); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/KeepEverythingExtractor.java100644 0 0 2460 11502365054 25622 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter; /** * Marks everything as content. * * @author Christian Kohlschütter */ public final class KeepEverythingExtractor extends ExtractorBase { public static final KeepEverythingExtractor INSTANCE = new KeepEverythingExtractor(); private KeepEverythingExtractor() { } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return MarkEverythingContentFilter.INSTANCE.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/KeepEverythingWithMinKWordsExtractor.java100644 0 0 3357 11502365054 30262 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.heuristics.SimpleBlockFusionProcessor; import de.l3s.boilerpipe.filters.simple.MarkEverythingContentFilter; import de.l3s.boilerpipe.filters.simple.MinWordsFilter; /** * A full-text extractor which extracts the largest text component of a page. * For news articles, it may perform better than the {@link DefaultExtractor}, * but usually worse than {@link ArticleExtractor}. * * @author Christian Kohlschütter */ public final class KeepEverythingWithMinKWordsExtractor extends ExtractorBase { private final MinWordsFilter filter; public KeepEverythingWithMinKWordsExtractor(final int kMin) { this.filter = new MinWordsFilter(kMin); } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return SimpleBlockFusionProcessor.INSTANCE.process(doc) | MarkEverythingContentFilter.INSTANCE.process(doc) | filter.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/LargestContentExtractor.java100644 0 0 3625 11502365054 25631 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.extractors; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier; import de.l3s.boilerpipe.filters.heuristics.BlockProximityFusion; import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter; /** * A full-text extractor which extracts the largest text component of a page. * For news articles, it may perform better than the {@link DefaultExtractor}, * but usually worse than {@link ArticleExtractor}. * * @author Christian Kohlschütter */ public final class LargestContentExtractor extends ExtractorBase { public static final LargestContentExtractor INSTANCE = new LargestContentExtractor(); private LargestContentExtractor() { } /** * Returns the singleton instance for {@link LargestContentExtractor}. */ public static LargestContentExtractor getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { return NumWordsRulesClassifier.INSTANCE.process(doc) | BlockProximityFusion.MAX_DISTANCE_1.process(doc) | KeepLargestBlockFilter.INSTANCE.process(doc); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/NumWordsRulesExtractor.java100644 0 0 3015 11502365054 25457 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.extractors;

import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.filters.english.NumWordsRulesClassifier;

/**
 * A quite generic full-text extractor solely based upon the number of words per
 * block (the current, the previous and the next block).
 *
 * @author Christian Kohlschütter
 */
public class NumWordsRulesExtractor extends ExtractorBase {
    public static final NumWordsRulesExtractor INSTANCE = new NumWordsRulesExtractor();

    /**
     * Returns the shared {@link NumWordsRulesExtractor} instance.
     */
    public static NumWordsRulesExtractor getInstance() {
        return INSTANCE;
    }

    /**
     * Delegates to {@link NumWordsRulesClassifier}.
     *
     * @param doc The document to process.
     * @return The value returned by the classifier.
     * @throws BoilerpipeProcessingException propagated from the classifier.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final boolean changed = NumWordsRulesClassifier.INSTANCE.process(doc);
        return changed;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/extractors/package.html100644 0 0 177 11502365054 22376 0ustar 0 0

This package contains some standard extractors (i.e., complete pipelines of BoilerpipeFilters).

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/DensityRulesClassifier.java100644 0 0 7653 11502365055 26351 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.english; import java.util.List; import java.util.ListIterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Classifies {@link TextBlock}s as content/not-content through rules that have * been determined using the C4.8 machine learning algorithm, as described in the * paper "Boilerplate Detection using Shallow Text Features", particularly using * text densities and link densities. * * @author Christian Kohlschütter */ public class DensityRulesClassifier implements BoilerpipeFilter { public static final DensityRulesClassifier INSTANCE = new DensityRulesClassifier(); /** * Returns the singleton instance for RulebasedBoilerpipeClassifier. 
*/ public static DensityRulesClassifier getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); boolean hasChanges = false; ListIterator it = textBlocks.listIterator(); if (!it.hasNext()) { return false; } TextBlock prevBlock = TextBlock.EMPTY_START; TextBlock currentBlock = it.next(); TextBlock nextBlock = it.hasNext() ? it.next() : TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; if (nextBlock != TextBlock.EMPTY_START) { while (it.hasNext()) { prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = it.next(); hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } return hasChanges; } protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) { final boolean isContent; if (curr.getLinkDensity() <= 0.333333) { if (prev.getLinkDensity() <= 0.555556) { if (curr.getTextDensity() <= 9) { if (next.getTextDensity() <= 10) { if (prev.getTextDensity() <= 4) { isContent = false; } else { isContent = true; } } else { isContent = true; } } else { if (next.getTextDensity() == 0) { isContent = false; } else { isContent = true; } } } else { if (next.getTextDensity() <= 11) { isContent = false; } else { isContent = true; } } } else { isContent = false; } return curr.setIsContent(isContent); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/HeuristicFilterBase.java100644 0 0 2350 11502365055 25577 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.english; import de.l3s.boilerpipe.document.TextBlock; /** * Base class for some heuristics that are used by boilerpipe filters. * * @author Christian Kohlschütter */ abstract class HeuristicFilterBase { protected static int getNumFullTextWords(final TextBlock tb) { return getNumFullTextWords(tb, 9); } protected static int getNumFullTextWords(final TextBlock tb, float minTextDensity) { if(tb.getTextDensity() >= minTextDensity) { return tb.getNumWords(); } else { return 0; } } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFilter.java100644 0 0 5550 11502365055 27750 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009,2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.filters.english; import java.util.Iterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks all blocks as "non-content" that occur after blocks that have been * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}. These marks are ignored * unless a minimum number of words in content blocks occur before this mark (default: 60). * This can be used in conjunction with an upstream {@link TerminatingBlocksFinder}. * * @author Christian Kohlschütter * @see TerminatingBlocksFinder */ public final class IgnoreBlocksAfterContentFilter extends HeuristicFilterBase implements BoilerpipeFilter { public static final IgnoreBlocksAfterContentFilter DEFAULT_INSTANCE = new IgnoreBlocksAfterContentFilter( 60); public static final IgnoreBlocksAfterContentFilter INSTANCE_200 = new IgnoreBlocksAfterContentFilter( 200); private final int minNumWords; /** * Returns the singleton instance for DeleteBlocksAfterContentFilter. 
*/ public static IgnoreBlocksAfterContentFilter getDefaultInstance() { return DEFAULT_INSTANCE; } public IgnoreBlocksAfterContentFilter(final int minNumWords) { this.minNumWords = minNumWords; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; int numWords = 0; boolean foundEndOfText = false; for (Iterator it = doc.getTextBlocks().iterator(); it.hasNext();) { TextBlock block = it.next(); final boolean endOfText = block .hasLabel(DefaultLabels.INDICATES_END_OF_TEXT); if (block.isContent()) { numWords += getNumFullTextWords(block); } if (endOfText && numWords >= minNumWords) { foundEndOfText = true; } if (foundEndOfText) { changes = true; block.setIsContent(false); } } return changes; } } ././@LongLink100644 0 0 147 11605072140 10351 Lustar 0 0 boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.javaboilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/IgnoreBlocksAfterContentFromEndFilter.ja100644 0 0 4570 11502365055 30675 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.filters.english; import java.util.List; import java.util.ListIterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks all blocks as "non-content" that occur after blocks that have been * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}, and after any content block. * This filter can be used in conjunction with an upstream {@link TerminatingBlocksFinder}. * * @author Christian Kohlschütter * @see TerminatingBlocksFinder */ public final class IgnoreBlocksAfterContentFromEndFilter extends HeuristicFilterBase implements BoilerpipeFilter { public static final IgnoreBlocksAfterContentFromEndFilter INSTANCE = new IgnoreBlocksAfterContentFromEndFilter( ); private IgnoreBlocksAfterContentFromEndFilter() { } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; int words = 0; List blocks = doc.getTextBlocks(); if (!blocks.isEmpty()) { ListIterator it = blocks.listIterator(blocks.size()); TextBlock tb; while(it.hasPrevious()) { tb = it.previous(); if(tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT)) { tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT); tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT); tb.setIsContent(false); changes = true; } else if(tb.isContent()) { words += tb.getNumWords(); if(words > 200) { break; } } } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/KeepLargestFulltextBlockFilter.java100644 0 0 6045 11502365055 27763 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.filters.english;

import java.util.List;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.filters.heuristics.KeepLargestBlockFilter;
import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Keeps the largest {@link TextBlock} only (by the number of words). In case of
 * more than one block with the same number of words, the first block is chosen.
 * All discarded blocks are marked "not content" and flagged as
 * {@link DefaultLabels#MIGHT_BE_CONTENT}.
 *
 * As opposed to {@link KeepLargestBlockFilter}, the number of words are
 * computed using {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}, which only counts
 * the words of blocks with a text density of at least 9 (blocks below that
 * density count as 0 words) and are thus believed to be full text.
 *
 * NOTE: Without language-specific fine-tuning (i.e., running the default instance), this filter
 * may lead to suboptimal results. You better use {@link KeepLargestBlockFilter} instead, which
 * works at the level of number-of-words instead of text densities.
 *
 * @author Christian Kohlschütter
 */
public final class KeepLargestFulltextBlockFilter extends HeuristicFilterBase implements BoilerpipeFilter {
    public static final KeepLargestFulltextBlockFilter INSTANCE = new KeepLargestFulltextBlockFilter();

    /**
     * Finds the content block with the most full-text words and makes it the
     * only content block of the document.
     *
     * @param doc The document to process.
     * @return <code>false</code> if the document has fewer than two blocks or
     *         no content block at all; <code>true</code> otherwise.
     * @throws BoilerpipeProcessingException declared by the interface; not
     *         thrown by the code in this method.
     */
    public boolean process(final TextDocument doc)
            throws BoilerpipeProcessingException {
        List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        int max = -1;
        TextBlock largestBlock = null;
        for (TextBlock tb : textBlocks) {
            if (!tb.isContent()) {
                continue;
            }
            int numWords = getNumFullTextWords(tb);
            // Strict comparison: on a tie the earlier block wins.
            if (numWords > max) {
                largestBlock = tb;
                max = numWords;
            }
        }

        if (largestBlock == null) {
            return false;
        }

        for (TextBlock tb : textBlocks) {
            if (tb == largestBlock) {
                tb.setIsContent(true);
            } else {
                tb.setIsContent(false);
                tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT);
            }
        }

        return true;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/MinFulltextWordsFilter.java100644 0 0 3730 11502365055 26342 0ustar 0 0
/**
 * boilerpipe
 *
 * Copyright (c) 2009 Christian Kohlschütter
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package de.l3s.boilerpipe.filters.english; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Keeps only those content blocks which contain at least k full-text words * (measured by {@link HeuristicFilterBase#getNumFullTextWords(TextBlock)}). k is 30 by default. * * @author Christian Kohlschütter */ public final class MinFulltextWordsFilter extends HeuristicFilterBase implements BoilerpipeFilter { public static final MinFulltextWordsFilter DEFAULT_INSTANCE = new MinFulltextWordsFilter( 30); private final int minWords; public static MinFulltextWordsFilter getDefaultInstance() { return DEFAULT_INSTANCE; } public MinFulltextWordsFilter(final int minWords) { this.minWords = minWords; } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; for (TextBlock tb : doc.getTextBlocks()) { if (!tb.isContent()) { continue; } if (getNumFullTextWords(tb) < minWords) { tb.setIsContent(false); changes = true; } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/NumWordsRulesClassifier.java100644 0 0 7670 11502365055 26507 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.filters.english; import java.util.List; import java.util.ListIterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Classifies {@link TextBlock}s as content/not-content through rules that have * been determined using the C4.8 machine learning algorithm, as described in * the paper "Boilerplate Detection using Shallow Text Features" (WSDM 2010), * particularly using number of words per block and link density per block. * * @author Christian Kohlschütter */ public class NumWordsRulesClassifier implements BoilerpipeFilter { public static final NumWordsRulesClassifier INSTANCE = new NumWordsRulesClassifier(); /** * Returns the singleton instance for RulebasedBoilerpipeClassifier. */ public static NumWordsRulesClassifier getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); boolean hasChanges = false; ListIterator it = textBlocks.listIterator(); if (!it.hasNext()) { return false; } TextBlock prevBlock = TextBlock.EMPTY_START; TextBlock currentBlock = it.next(); TextBlock nextBlock = it.hasNext() ? 
it.next() : TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; if (nextBlock != TextBlock.EMPTY_START) { while (it.hasNext()) { prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = it.next(); hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } prevBlock = currentBlock; currentBlock = nextBlock; nextBlock = TextBlock.EMPTY_START; hasChanges = classify(prevBlock, currentBlock, nextBlock) | hasChanges; } return hasChanges; } protected boolean classify(final TextBlock prev, final TextBlock curr, final TextBlock next) { final boolean isContent; if (curr.getLinkDensity() <= 0.333333) { if (prev.getLinkDensity() <= 0.555556) { if (curr.getNumWords() <= 16) { if (next.getNumWords() <= 15) { if (prev.getNumWords() <= 4) { isContent = false; } else { isContent = true; } } else { isContent = true; } } else { isContent = true; } } else { if (curr.getNumWords() <= 40) { if (next.getNumWords() <= 17) { isContent = false; } else { isContent = true; } } else { isContent = true; } } } else { isContent = false; } return curr.setIsContent(isContent); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/TerminatingBlocksFinder.java100644 0 0 7021 11506127153 26445 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package de.l3s.boilerpipe.filters.english;

import java.util.Locale;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Finds blocks which are potentially indicating the end of an article text and
 * marks them with {@link DefaultLabels#INDICATES_END_OF_TEXT}. This can be used
 * in conjunction with a downstream {@link IgnoreBlocksAfterContentFilter}.
 *
 * @author Christian Kohlschütter
 * @see IgnoreBlocksAfterContentFilter
 */
public class TerminatingBlocksFinder implements BoilerpipeFilter {
    public static final TerminatingBlocksFinder INSTANCE = new TerminatingBlocksFinder();

    /**
     * Returns the singleton instance for TerminatingBlocksFinder.
     */
    public static TerminatingBlocksFinder getInstance() {
        return INSTANCE;
    }

    /**
     * Labels every short block (fewer than 15 words, at least 8 characters
     * after trimming) whose lower-cased text matches one of the known
     * "end of article" phrases.
     *
     * @param doc
     *            The document whose blocks are examined.
     * @return {@code true} if at least one block was labelled.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        boolean changes = false;

        for (TextBlock tb : doc.getTextBlocks()) {
            if (tb.getNumWords() >= 15) {
                continue;
            }
            final String text = tb.getText().trim();
            if (text.length() < 8) {
                continue;
            }
            // Fixed locale: with the default locale, e.g. Turkish, "I" would
            // lower-case to a dotless i and phrases such as
            // "please rate this" would no longer match.
            final String textLC = text.toLowerCase(Locale.ENGLISH);
            if (indicatesEndOfText(textLC)) {
                tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT);
                changes = true;
            }
        }

        return changes;
    }

    /**
     * Checks the lower-cased block text against the list of known phrases.
     */
    private static boolean indicatesEndOfText(final String textLC) {
        return textLC.startsWith("comments")
                || startsWithNumber(textLC, textLC.length(), " comments",
                        " users responded in")
                || textLC.startsWith("© reuters")
                || textLC.startsWith("please rate this")
                || textLC.startsWith("post a comment")
                || textLC.contains("what you think...")
                || textLC.contains("add your comment")
                || textLC.contains("add comment")
                || textLC.contains("reader views")
                || textLC.contains("have your say")
                || textLC.contains("reader comments")
                || textLC.contains("rätta artikeln")
                || textLC
                        .equals("thanks for your comments - this feedback is now closed");
    }

    /**
     * Checks whether the given text t starts with a sequence of digits,
     * followed by one of the given strings.
     *
     * @param t
     *            The text to examine
     * @param len
     *            The length of the text to examine
     * @param str
     *            Any strings that may follow the digits.
     * @return true if at least one combination matches
     */
    private static boolean startsWithNumber(final String t, final int len,
            final String... str) {
        int j = 0;
        while (j < len && isDigit(t.charAt(j))) {
            j++;
        }
        if (j != 0) {
            for (String s : str) {
                if (t.startsWith(s, j)) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Returns whether c is an ASCII digit. */
    private static boolean isDigit(final char c) {
        return c >= '0' && c <= '9';
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/english/package.html100644 0 0 360 11502365055 23274 0ustar 0 0

The BoilerpipeFilters in this package have only been tested on English text.

That is, they will probably work with other Western languages, but may need some parameter tuning to perform well.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/AddPrecedingLabelsFilter.java100644 0 0 5261 11605070271 27233 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2011 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package de.l3s.boilerpipe.filters.heuristics;

import java.util.List;
import java.util.ListIterator;
import java.util.Set;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;

/**
 * Adds the labels of the preceding block to the current block, optionally adding a prefix.
 *
 * @author Christian Kohlschütter
 */
public final class AddPrecedingLabelsFilter implements BoilerpipeFilter {

    /** Copies labels unchanged. */
    public static final AddPrecedingLabelsFilter INSTANCE = new AddPrecedingLabelsFilter("");
    /** Copies labels with a "^" prefix. */
    public static final AddPrecedingLabelsFilter INSTANCE_PRE = new AddPrecedingLabelsFilter("^");

    private final String labelPrefix;

    /**
     * Creates a new {@link AddPrecedingLabelsFilter} instance.
     *
     * @param labelPrefix The string prepended to every copied label.
     */
    public AddPrecedingLabelsFilter(final String labelPrefix) {
        this.labelPrefix = labelPrefix;
    }

    /**
     * Copies the labels of each block to the block directly after it.
     * Documents with fewer than two blocks are left untouched.
     *
     * @param doc The document whose blocks are labelled in place.
     * @return {@code true} if at least one preceding block had labels to copy.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        boolean changes = false;

        // Walk backwards so that each block's labels are read before that
        // block receives labels from its own predecessor; labels therefore
        // move down by exactly one block. The walk runs to the first block:
        // the previous counter-based loop stopped one pair early, so the
        // first block's labels never reached the second block.
        final ListIterator<TextBlock> it = textBlocks.listIterator(textBlocks.size());
        TextBlock blockBelow = it.previous();
        while (it.hasPrevious()) {
            final TextBlock block = it.previous();

            final Set<String> labels = block.getLabels();
            if (labels != null && !labels.isEmpty()) {
                for (String l : labels) {
                    blockBelow.addLabel(labelPrefix + l);
                }
                changes = true;
            }
            blockBelow = block;
        }

        return changes;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/ArticleMetadataFilter.java100644 0 0 2423 11502365055 26623 0ustar 0 0
package de.l3s.boilerpipe.filters.heuristics;

import java.util.regex.Pattern;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Marks short blocks (at most 10 words) as content and labels them
 * {@link DefaultLabels#ARTICLE_METADATA} if their text matches one of the
 * patterns below (a date/time-like line, or a line starting with "By ").
 */
public class ArticleMetadataFilter implements BoilerpipeFilter {
    private static final Pattern[] PATTERNS_SHORT = new Pattern[] {
            Pattern
                    .compile("^[0-9 \\,\\./]*\\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\\b[0-9 \\,\\:apm\\./]*([CPSDMGET]{2,3})?$"),
            Pattern.compile("^[Bb]y ") };

    public static final ArticleMetadataFilter INSTANCE = new ArticleMetadataFilter();

    private ArticleMetadataFilter() {
    }

    /**
     * Applies the metadata patterns to every block of at most 10 words.
     *
     * @param doc The document whose blocks are examined.
     * @return {@code true} if at least one block matched.
     */
    @Override
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        boolean changed = false;
        for (TextBlock tb : doc.getTextBlocks()) {
            if (tb.getNumWords() > 10) {
                continue;
            }
            if (matchesAnyPattern(tb.getText())) {
                changed = true;
                tb.setIsContent(true);
                tb.addLabel(DefaultLabels.ARTICLE_METADATA);
            }
        }
        return changed;
    }

    /** Returns whether any pattern is found in the text; stops at the first hit. */
    private static boolean matchesAnyPattern(final String text) {
        for (Pattern p : PATTERNS_SHORT) {
            if (p.matcher(text).find()) {
                return true;
            }
        }
        return false;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/BlockProximityFusion.java100644 0 0 10270 11502365055 26613 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import java.util.Iterator; import java.util.List; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. * This probably makes sense only in cases where an upstream filter already has removed some blocks.
* * @author Christian Kohlschütter */
public final class BlockProximityFusion implements BoilerpipeFilter {

    private final int maxBlocksDistance;

    public static final BlockProximityFusion MAX_DISTANCE_1 = new BlockProximityFusion(
            1, false, false);
    public static final BlockProximityFusion MAX_DISTANCE_1_SAME_TAGLEVEL = new BlockProximityFusion(
            1, false, true);
    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY = new BlockProximityFusion(
            1, true, false);
    public static final BlockProximityFusion MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = new BlockProximityFusion(
            1, true, true);

    private final boolean contentOnly;
    private final boolean sameTagLevelOnly;

    /**
     * Creates a new {@link BlockProximityFusion} instance.
     *
     * @param maxBlocksDistance The maximum distance in blocks.
     * @param contentOnly If set, a block is only fused into a preceding block
     *            that is itself marked as content.
     * @param sameTagLevelOnly If set, only blocks with the same tag level are fused.
     */
    public BlockProximityFusion(final int maxBlocksDistance,
            final boolean contentOnly, final boolean sameTagLevelOnly) {
        this.maxBlocksDistance = maxBlocksDistance;
        this.contentOnly = contentOnly;
        this.sameTagLevelOnly = sameTagLevelOnly;
    }

    /**
     * Merges each content block into the block kept before it when the gap
     * between them (in original block offsets) is within the configured
     * distance and the optional content/tag-level conditions hold.
     *
     * @param doc The document whose block list is modified in place.
     * @return {@code true} if at least one pair of blocks was merged.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        TextBlock prevBlock;
        int offset;
        if (contentOnly) {
            // Start at the first content block; everything before it is skipped.
            prevBlock = null;
            offset = 0;
            for (TextBlock tb : textBlocks) {
                offset++;
                if (tb.isContent()) {
                    prevBlock = tb;
                    break;
                }
            }
            if (prevBlock == null) {
                return false;
            }
        } else {
            prevBlock = textBlocks.get(0);
            offset = 1;
        }

        boolean changes = false;
        for (Iterator<TextBlock> it = textBlocks.listIterator(offset); it
                .hasNext();) {
            final TextBlock block = it.next();
            if (!block.isContent()) {
                // Non-content blocks are never merged; they become the new anchor.
                prevBlock = block;
                continue;
            }

            final int diffBlocks = block.getOffsetBlocksStart()
                    - prevBlock.getOffsetBlocksEnd() - 1;
            boolean ok = diffBlocks <= maxBlocksDistance;
            // "block" is known to be content here, so only prevBlock is checked.
            if (ok && contentOnly && !prevBlock.isContent()) {
                ok = false;
            }
            if (ok && sameTagLevelOnly
                    && prevBlock.getTagLevel() != block.getTagLevel()) {
                ok = false;
            }

            if (ok) {
                prevBlock.mergeNext(block);
                it.remove();
                changes = true;
            } else {
                prevBlock = block;
            }
        }

        return changes;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/ContentFusion.java100644 0 0 3622 11502365055 25231 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package de.l3s.boilerpipe.filters.heuristics;

import java.util.List;
import java.util.ListIterator;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.DefaultLabels;

/**
 * Merges a block into the content block before it when the block's link
 * density is below 0.56 and it is not labelled
 * {@link DefaultLabels#STRICTLY_NOT_CONTENT}. Passes are repeated until a
 * pass merges nothing.
 */
public final class ContentFusion implements BoilerpipeFilter {

    public static final ContentFusion INSTANCE = new ContentFusion();

    /**
     * Creates a new {@link ContentFusion} instance.
     */
    public ContentFusion() {
    }

    /**
     * Runs fusion passes over the document's blocks.
     *
     * @param doc The document whose block list is modified in place.
     * @return {@code true} if at least one block was merged.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        boolean anyChanges = false;
        boolean changes;
        do {
            changes = false;
            // Every pass starts from the first block again. Carrying the
            // anchor over from the previous pass would pair the iterator,
            // which restarts at index 1, with a block further down the list.
            TextBlock prevBlock = textBlocks.get(0);
            for (ListIterator<TextBlock> it = textBlocks.listIterator(1); it
                    .hasNext();) {
                final TextBlock block = it.next();

                if (prevBlock.isContent()
                        && block.getLinkDensity() < 0.56
                        && !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {
                    prevBlock.mergeNext(block);
                    it.remove();
                    changes = true;
                } else {
                    prevBlock = block;
                }
            }
            anyChanges |= changes;
        } while (changes);

        return anyChanges;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/DocumentTitleMatchClassifier.java100644 0 0 7567 11502365055 30211 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import java.util.HashSet; import java.util.Set; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks {@link TextBlock}s which contain parts of the HTML * <TITLE> tag, using some heuristics which are quite * specific to the news domain.
* * @author Christian Kohlschütter */ public final class DocumentTitleMatchClassifier implements BoilerpipeFilter { private final Set potentialTitles; public DocumentTitleMatchClassifier(String title) { if (title == null) { this.potentialTitles = null; } else { title = title.trim(); if (title.length() == 0) { this.potentialTitles = null; } else { this.potentialTitles = new HashSet(); potentialTitles.add(title); String p; p = getLongestPart(title, "[ ]*[\\|»|:][ ]*"); if(p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)][ ]*"); if(p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|:\\(\\)\\-][ ]*"); if(p != null) { potentialTitles.add(p); } p = getLongestPart(title, "[ ]*[\\|»|,|:\\(\\)\\-][ ]*"); if(p != null) { potentialTitles.add(p); } } } } public Set getPotentialTitles() { return potentialTitles; } private String getLongestPart(final String title, final String pattern) { String[] parts = title.split(pattern); if(parts.length == 1) { return null; } int longestNumWords = 0; String longestPart = ""; for (int i = 0; i < parts.length; i++) { String p = parts[i]; if (p.contains(".com")) { continue; } final int numWords = p.split("[\b]+").length; if (numWords > longestNumWords || p.length() > longestPart.length()) { longestNumWords = numWords; longestPart = p; } } if(longestPart.length() == 0) { return null; } else { return longestPart.trim(); } } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { if (potentialTitles == null) { return false; } boolean changes = false; for (final TextBlock tb : doc.getTextBlocks()) { final String text = tb.getText().trim(); for(String candidate : potentialTitles) { if(candidate.equals(text)) { tb.addLabel(DefaultLabels.TITLE); changes = true; } } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/ExpandTitleToContentFilter.java100644 0 0 4663 11502365055 27666 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 
Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks all {@link TextBlock}s "content" which are between the headline and the part that * has already been marked content, if they are marked {@link DefaultLabels#MIGHT_BE_CONTENT}. * * This filter is quite specific to the news domain. * * @author Christian Kohlschütter */ public final class ExpandTitleToContentFilter implements BoilerpipeFilter { public static final ExpandTitleToContentFilter INSTANCE = new ExpandTitleToContentFilter(); /** * Returns the singleton instance for ExpandTitleToContentFilter. 
*/ public static ExpandTitleToContentFilter getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { int i = 0; int title = -1; int contentStart = -1; for (TextBlock tb : doc.getTextBlocks()) { if (contentStart == -1 && tb.hasLabel(DefaultLabels.TITLE)) { title = i; contentStart = -1; } if (contentStart == -1 && tb.isContent()) { contentStart = i; } i++; } if (contentStart <= title || title == -1) { return false; } boolean changes = false; for (TextBlock tb : doc.getTextBlocks().subList(title, contentStart)) { if (tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)) { changes = tb.setIsContent(true) | changes; } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/KeepLargestBlockFilter.java100644 0 0 6225 11502365055 26764 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import java.util.List; import java.util.ListIterator; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Keeps the largest {@link TextBlock} only (by the number of words). In case of * more than one block with the same number of words, the first block is chosen. 
* All discarded blocks are marked "not content" and flagged as * {@link DefaultLabels#MIGHT_BE_CONTENT}. * * Note that, by default, only TextBlocks marked as "content" are taken into consideration. * * @author Christian Kohlschütter */ public final class KeepLargestBlockFilter implements BoilerpipeFilter { public static final KeepLargestBlockFilter INSTANCE = new KeepLargestBlockFilter( false); public static final KeepLargestBlockFilter INSTANCE_EXPAND_TO_SAME_TAGLEVEL = new KeepLargestBlockFilter( true); private final boolean expandToSameLevelText; public KeepLargestBlockFilter(boolean expandToSameLevelText) { this.expandToSameLevelText = expandToSameLevelText; } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); if (textBlocks.size() < 2) { return false; } int maxNumWords = -1; TextBlock largestBlock = null; int level = -1; int i = 0; int n = -1; for (TextBlock tb : textBlocks) { if (tb.isContent()) { final int nw = tb.getNumWords(); if (nw > maxNumWords) { largestBlock = tb; maxNumWords = nw; n = i; if (expandToSameLevelText) { level = tb.getTagLevel(); } } } i++; } for (TextBlock tb : textBlocks) { if (tb == largestBlock) { tb.setIsContent(true); } else { tb.setIsContent(false); tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT); } } if (expandToSameLevelText && n != -1) { for (ListIterator it = textBlocks.listIterator(n); it .hasPrevious();) { TextBlock tb = it.previous(); final int tl = tb.getTagLevel(); if(tl < level) { break; } else if(tl == level) { tb.setIsContent(true); } } for (ListIterator it = textBlocks.listIterator(n); it .hasNext();) { TextBlock tb = it.next(); final int tl = tb.getTagLevel(); if(tl < level) { break; } else if(tl == level) { tb.setIsContent(true); } } } return true; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/LabelFusion.java100644 0 0 5420 11502365055 24634 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The 
author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.heuristics; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Fuses adjacent blocks if their labels are equal. * * @author Christian Kohlschütter */ public final class LabelFusion implements BoilerpipeFilter { public static final LabelFusion INSTANCE = new LabelFusion(""); private final String labelPrefix; /** * Creates a new {@link LabelFusion} instance. * * @param maxBlocksDistance The maximum distance in blocks. 
* @param contentOnly */ public LabelFusion(final String labelPrefix) { this.labelPrefix = labelPrefix; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); if (textBlocks.size() < 2) { return false; } boolean changes = false; TextBlock prevBlock = textBlocks.get(0); int offset = 1; for (Iterator it = textBlocks.listIterator(offset); it .hasNext();) { TextBlock block = it.next(); if(equalLabels(prevBlock.getLabels(), block.getLabels())) { prevBlock.mergeNext(block); it.remove(); changes = true; } else { prevBlock = block; } } return changes; } private boolean equalLabels(Set labels, Set labels2) { if(labels == null || labels2 == null) { return false; } return markupLabelsOnly(labels).equals(markupLabelsOnly(labels2)); } private Set markupLabelsOnly(final Set set1) { Set set = new HashSet(set1); for(Iterator it = set.iterator(); it.hasNext(); ) { if(!it.next().startsWith(DefaultLabels.MARKUP_PREFIX)) { it.remove(); } } return set; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/SimpleBlockFusionProcessor.java100644 0 0 4145 11502365055 27724 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package de.l3s.boilerpipe.filters.heuristics;

import java.util.Iterator;
import java.util.List;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;

/**
 * Merges two subsequent blocks if their text densities are equal.
 *
 * @author Christian Kohlschütter
 */
public class SimpleBlockFusionProcessor implements BoilerpipeFilter {
    public static final SimpleBlockFusionProcessor INSTANCE = new SimpleBlockFusionProcessor();

    /**
     * Returns the singleton instance for SimpleBlockFusionProcessor.
     */
    public static SimpleBlockFusionProcessor getInstance() {
        return INSTANCE;
    }

    /**
     * Merges each block into the block kept before it when both report
     * exactly the same text density.
     *
     * @param doc
     *            The document whose block list is modified in place.
     * @return {@code true} if at least one pair of blocks was merged.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> textBlocks = doc.getTextBlocks();
        if (textBlocks.size() < 2) {
            return false;
        }

        boolean changes = false;
        TextBlock b1 = textBlocks.get(0);
        for (Iterator<TextBlock> it = textBlocks.listIterator(1); it.hasNext();) {
            final TextBlock b2 = it.next();

            // Exact comparison is intended: only identical densities fuse.
            final boolean similar = (b1.getTextDensity() == b2.getTextDensity());

            if (similar) {
                b1.mergeNext(b2);
                it.remove();
                changes = true;
            } else {
                b1 = b2;
            }
        }

        return changes;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/heuristics/package.html100644 0 0 140 11502365055 24021 0ustar 0 0

The BoilerpipeFilters in this package are pure heuristics.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/BoilerplateBlockFilter.java100644 0 0 3522 11502365055 26124 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import java.util.Iterator; import java.util.List; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Removes {@link TextBlock}s which have explicitly been marked as "not content". * * @author Christian Kohlschütter */ public final class BoilerplateBlockFilter implements BoilerpipeFilter { public static final BoilerplateBlockFilter INSTANCE = new BoilerplateBlockFilter(); /** * Returns the singleton instance for BoilerplateBlockFilter. 
*/ public static BoilerplateBlockFilter getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List textBlocks = doc.getTextBlocks(); boolean hasChanges = false; for (Iterator it = textBlocks.iterator(); it.hasNext();) { TextBlock tb = it.next(); if (!tb.isContent()) { it.remove(); hasChanges = true; } } return hasChanges; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/InvertedFilter.java100644 0 0 3001 11502365055 24457 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package de.l3s.boilerpipe.filters.simple;

import java.util.List;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;

/**
 * Reverts the "isContent" flag for all {@link TextBlock}s
 *
 * @author Christian Kohlschütter
 */
public final class InvertedFilter implements BoilerpipeFilter {
    public static final InvertedFilter INSTANCE = new InvertedFilter();

    private InvertedFilter() {
    }

    /**
     * Flips the content flag of every block: content becomes "not content"
     * and vice versa.
     *
     * @param doc
     *            The document whose blocks are updated in place.
     * @return {@code false} if the document has no blocks, {@code true}
     *         otherwise.
     */
    public boolean process(TextDocument doc)
            throws BoilerpipeProcessingException {
        final List<TextBlock> tbs = doc.getTextBlocks();
        if (tbs.isEmpty()) {
            return false;
        }
        for (TextBlock tb : tbs) {
            tb.setIsContent(!tb.isContent());
        }

        return true;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/LabelToBoilerplateFilter.java100644 0 0 3644 11502365055 26421 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.DefaultLabels; /** * Marks all blocks that contain a given label as "boilerplate".
* * @author Christian Kohlschütter */ public final class LabelToBoilerplateFilter implements BoilerpipeFilter { public static final LabelToBoilerplateFilter INSTANCE_STRICTLY_NOT_CONTENT = new LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT); private String[] labels; public LabelToBoilerplateFilter(final String... label) { this.labels = label; } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; BLOCK_LOOP: for (TextBlock tb : doc.getTextBlocks()) { if (tb.isContent()) { for (String label : labels) { if (tb.hasLabel(label)) { tb.setIsContent(false); changes = true; continue BLOCK_LOOP; } } } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/LabelToContentFilter.java100644 0 0 3335 11502365055 25566 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Marks all blocks that contain a given label as "content". * * @author Christian Kohlschütter */ public final class LabelToContentFilter implements BoilerpipeFilter { private String[] labels; public LabelToContentFilter(final String... 
label) { this.labels = label; } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; BLOCK_LOOP: for (TextBlock tb : doc.getTextBlocks()) { if (!tb.isContent()) { for (String label : labels) { if (tb.hasLabel(label)) { tb.setIsContent(true); changes = true; continue BLOCK_LOOP; } } } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/MarkEverythingContentFilter.java100644 0 0 3017 11502365055 27200 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Marks all blocks as content. 
* * @author Christian Kohlschütter */ public final class MarkEverythingContentFilter implements BoilerpipeFilter { public static final MarkEverythingContentFilter INSTANCE = new MarkEverythingContentFilter(); private MarkEverythingContentFilter() { } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; for (TextBlock tb : doc.getTextBlocks()) { if (!tb.isContent()) { tb.setIsContent(true); changes = true; } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/MinClauseWordsFilter.java100644 0 0 7060 11502365055 25607 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import java.util.regex.Matcher; import java.util.regex.Pattern; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Keeps only blocks that have at least one segment fragment ("clause") with at * least k words (default: 5). * * NOTE: You might consider using the {@link SplitParagraphBlocksFilter} * upstream. 
* * @author Christian Kohlschütter * @see SplitParagraphBlocksFilter */ public final class MinClauseWordsFilter implements BoilerpipeFilter { public static final MinClauseWordsFilter INSTANCE = new MinClauseWordsFilter( 5, false); private int minWords; private final boolean acceptClausesWithoutDelimiter; public MinClauseWordsFilter(final int minWords) { this(minWords, false); } public MinClauseWordsFilter(final int minWords, final boolean acceptClausesWithoutDelimiter) { this.minWords = minWords; this.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter; } private final Pattern PAT_CLAUSE_DELIMITER = Pattern .compile("[\\p{L}\\d][\\,\\.\\:\\;\\!\\?]+([ \\n\\r]+|$)"); private final Pattern PAT_WHITESPACE = Pattern.compile("[ \\n\\r]+"); public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; for (TextBlock tb : doc.getTextBlocks()) { if (!tb.isContent()) { continue; } final String text = tb.getText(); Matcher m = PAT_CLAUSE_DELIMITER.matcher(text); boolean found = m.find(); int start = 0; int end; boolean hasClause = false; while (found) { end = m.start() + 1; hasClause = isClause(text.subSequence(start, end)); start = m.end(); if (hasClause) { break; } found = m.find(); } end = text.length(); // since clauses should *always end* with a delimiter, we normally // don't consider text without one if (acceptClausesWithoutDelimiter) { hasClause |= isClause(text.subSequence(start, end)); } if (!hasClause) { tb.setIsContent(false); changes = true; // System.err.println("IS NOT CONTENT: " + text); } } return changes; } private boolean isClause(final CharSequence text) { Matcher m = PAT_WHITESPACE.matcher(text); int n = 1; while (m.find()) { n++; if (n >= minWords) { return true; } } return n >= minWords; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/MinWordsFilter.java100644 0 0 3154 11502365055 24452 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author 
licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Keeps only those content blocks which contain at least k words. * * @author Christian Kohlschütter */ public final class MinWordsFilter implements BoilerpipeFilter { private final int minWords; public MinWordsFilter(final int minWords) { this.minWords = minWords; } public boolean process(final TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; for (TextBlock tb : doc.getTextBlocks()) { if (!tb.isContent()) { continue; } if (tb.getNumWords() < minWords) { tb.setIsContent(false); changes = true; } } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/SplitParagraphBlocksFilter.java100644 0 0 5146 11502365055 26772 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.filters.simple; import java.util.ArrayList; import java.util.List; import java.util.Set; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * Splits TextBlocks at paragraph boundaries. * * NOTE: This is not fully supported (i.e., it will break highlighting support * via #getContainedTextElements()), but this one probably is necessary for some other * filters. * * @author Christian Kohlschütter * @see MinClauseWordsFilter */ public final class SplitParagraphBlocksFilter implements BoilerpipeFilter { public static final SplitParagraphBlocksFilter INSTANCE = new SplitParagraphBlocksFilter(); /** * Returns the singleton instance for TerminatingBlocksFinder. 
*/ public static SplitParagraphBlocksFilter getInstance() { return INSTANCE; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { boolean changes = false; final List blocks = doc.getTextBlocks(); final List blocksNew = new ArrayList(); for (TextBlock tb : blocks) { final String text = tb.getText(); final String[] paragraphs = text.split("[\n\r]+"); if (paragraphs.length < 2) { blocksNew.add(tb); continue; } final boolean isContent = tb.isContent(); final Set labels = tb.getLabels(); for (String p : paragraphs) { final TextBlock tbP = new TextBlock(p); tbP.setIsContent(isContent); tbP.addLabels(labels); blocksNew.add(tbP); changes = true; } } if (changes) { blocks.clear(); blocks.addAll(blocksNew); } return changes; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/SurroundingToContentFilter.java100644 0 0 3105 11502365055 27061 0ustar 0 0 package de.l3s.boilerpipe.filters.simple; import java.util.Iterator; import java.util.List; import de.l3s.boilerpipe.BoilerpipeFilter; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.conditions.TextBlockCondition; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; public class SurroundingToContentFilter implements BoilerpipeFilter { public static final SurroundingToContentFilter INSTANCE_TEXT = new SurroundingToContentFilter(new TextBlockCondition() { @Override public boolean meetsCondition(TextBlock tb) { return tb.getLinkDensity() == 0 && tb.getNumWords() > 6; } }); private final TextBlockCondition cond; public SurroundingToContentFilter(final TextBlockCondition cond) { this.cond = cond; } public boolean process(TextDocument doc) throws BoilerpipeProcessingException { List tbs = doc.getTextBlocks(); if (tbs.size() < 3) { return false; } TextBlock a = tbs.get(0); TextBlock b = tbs.get(1); TextBlock c; boolean hasChanges = false; for (Iterator it= tbs.listIterator(2);it.hasNext();) { c = it.next(); if(!b.isContent() 
&& a.isContent() && c.isContent() && cond.meetsCondition(b)) { b.setIsContent(true); hasChanges = true; } a = c; if(!it.hasNext()) { break; } b = it.next(); } return hasChanges; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/filters/simple/package.html100644 0 0 215 11502365055 23133 0ustar 0 0

The BoilerpipeFilters in this package are straight-forward and probably not really specific to English.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/labels/ConditionalLabelAction.java100644 0 0 2460 11502365055 24403 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.labels; import de.l3s.boilerpipe.conditions.TextBlockCondition; import de.l3s.boilerpipe.document.TextBlock; /** * Adds labels to a {@link TextBlock} if the given criteria are met. * * @author Christian Kohlschütter */ public final class ConditionalLabelAction extends LabelAction { private final TextBlockCondition condition; public ConditionalLabelAction(TextBlockCondition condition, String... labels) { super(labels); this.condition = condition; } public void addTo(final TextBlock tb) { if (condition.meetsCondition(tb)) { addLabelsTo(tb); } } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/labels/DefaultLabels.java100644 0 0 2757 11502365055 22562 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Some pre-defined labels which can be used in conjunction with
 * {@link TextBlock#addLabel(String)} and {@link TextBlock#hasLabel(String)}.
 *
 * This class only holds constants and cannot be instantiated.
 *
 * @author Christian Kohlschütter
 */
public final class DefaultLabels {
    public static final String TITLE = "de.l3s.boilerpipe/TITLE";
    public static final String ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA";
    public static final String INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT";
    public static final String MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT";
    public static final String STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT";
    public static final String HR = "de.l3s.boilerpipe/HR";

    public static final String MARKUP_PREFIX = "<";

    private DefaultLabels() {
        // constants holder; not meant to be instantiated
    }
}
labels) { this.labels = labels; } public void addTo(final TextBlock tb) { addLabelsTo(tb); } protected final void addLabelsTo(final TextBlock tb) { tb.addLabels(labels); } public String toString() { return super.toString()+"{"+Arrays.asList(labels)+"}"; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/package.html100644 0 0 107 11502365055 20172 0ustar 0 0

The Boilerpipe top-level package.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java100644 0 0 24052 11502365056 25025 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import java.util.ArrayList; import java.util.BitSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.labels.LabelAction; import de.l3s.boilerpipe.util.UnicodeTokenizer; /** * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can * be used by different parser implementations, e.g. NekoHTML and TagSoup. 
* * @author Christian Kohlschütter */ public class BoilerpipeHTMLContentHandler implements ContentHandler { private final Map tagActions; private String title = null; static final String ANCHOR_TEXT_START = "$\ue00a<"; static final String ANCHOR_TEXT_END = ">\ue00a$"; StringBuilder tokenBuffer = new StringBuilder(); StringBuilder textBuffer = new StringBuilder(); int inBody = 0; int inAnchor = 0; int inIgnorableElement = 0; int tagLevel = 0; int blockTagLevel = -1; boolean sbLastWasWhitespace = false; private int textElementIdx = 0; private final List textBlocks = new ArrayList(); private String lastStartTag = null; @SuppressWarnings("unused") private String lastEndTag = null; @SuppressWarnings("unused") private Event lastEvent = null; private int offsetBlocks = 0; private BitSet currentContainedTextElements = new BitSet(); private boolean flush = false; boolean inAnchorText = false; LinkedList> labelStacks = new LinkedList>(); LinkedList fontSizeStack = new LinkedList(); /** * Recycles this instance. */ public void recycle() { tokenBuffer.setLength(0); textBuffer.setLength(0); inBody = 0; inAnchor = 0; inIgnorableElement = 0; sbLastWasWhitespace = false; textElementIdx = 0; textBlocks.clear(); lastStartTag = null; lastEndTag = null; lastEvent = null; offsetBlocks = 0; currentContainedTextElements.clear(); flush = false; inAnchorText = false; } /** * Constructs a {@link BoilerpipeHTMLContentHandler} using the * {@link DefaultTagActionMap}. */ public BoilerpipeHTMLContentHandler() { this(DefaultTagActionMap.INSTANCE); } /** * Constructs a {@link BoilerpipeHTMLContentHandler} using the given * {@link TagActionMap}. * * @param tagActions * The {@link TagActionMap} to use, e.g. * {@link DefaultTagActionMap}. 
*/ public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { this.tagActions = tagActions; } // @Override public void endDocument() throws SAXException { flushBlock(); } // @Override public void endPrefixMapping(String prefix) throws SAXException { } // @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (!sbLastWasWhitespace) { textBuffer.append(' '); tokenBuffer.append(' '); } sbLastWasWhitespace = true; } // @Override public void processingInstruction(String target, String data) throws SAXException { } // @Override public void setDocumentLocator(Locator locator) { } // @Override public void skippedEntity(String name) throws SAXException { } // @Override public void startDocument() throws SAXException { } // @Override public void startPrefixMapping(String prefix, String uri) throws SAXException { } // @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { labelStacks.add(null); TagAction ta = tagActions.get(localName); if (ta != null) { if(ta.changesTagLevel()) { tagLevel++; } flush = ta.start(this, localName, qName, atts) | flush; } else { tagLevel++; flush = true; } lastEvent = Event.START_TAG; lastStartTag = localName; } // @Override public void endElement(String uri, String localName, String qName) throws SAXException { TagAction ta = tagActions.get(localName); if (ta != null) { flush = ta.end(this, localName, qName) | flush; } else { flush = true; } if(ta == null || ta.changesTagLevel()) { tagLevel--; } if (flush) { flushBlock(); } lastEvent = Event.END_TAG; lastEndTag = localName; labelStacks.removeLast(); } // @Override public void characters(char[] ch, int start, int length) throws SAXException { textElementIdx++; if (flush) { flushBlock(); flush = false; } if (inIgnorableElement != 0) { return; } char c; boolean startWhitespace = false; boolean endWhitespace = false; if (length == 0) { return; } final int end = start + length; for 
(int i = start; i < end; i++) { if (Character.isWhitespace(ch[i])) { ch[i] = ' '; } } while (start < end) { c = ch[start]; if (c == ' ') { startWhitespace = true; start++; length--; } else { break; } } while (length > 0) { c = ch[start + length - 1]; if (c == ' ') { endWhitespace = true; length--; } else { break; } } if (length == 0) { if (startWhitespace || endWhitespace) { if (!sbLastWasWhitespace) { textBuffer.append(' '); tokenBuffer.append(' '); } sbLastWasWhitespace = true; } else { sbLastWasWhitespace = false; } lastEvent = Event.WHITESPACE; return; } if (startWhitespace) { if (!sbLastWasWhitespace) { textBuffer.append(' '); tokenBuffer.append(' '); } } if (blockTagLevel == -1) { blockTagLevel = tagLevel; } textBuffer.append(ch, start, length); tokenBuffer.append(ch, start, length); if (endWhitespace) { textBuffer.append(' '); tokenBuffer.append(' '); } sbLastWasWhitespace = endWhitespace; lastEvent = Event.CHARACTERS; currentContainedTextElements.set(textElementIdx); } List getTextBlocks() { return textBlocks; } public void flushBlock() { if (inBody == 0) { if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { setTitle(tokenBuffer.toString().trim()); } textBuffer.setLength(0); tokenBuffer.setLength(0); return; } final int length = tokenBuffer.length(); switch (length) { case 0: return; case 1: if (sbLastWasWhitespace) { textBuffer.setLength(0); tokenBuffer.setLength(0); return; } } final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); int numWords = 0; int numLinkedWords = 0; int numWrappedLines = 0; int currentLineLength = -1; // don't count the first space final int maxLineLength = 80; int numTokens = 0; int numWordsCurrentLine = 0; for (String token : tokens) { if (ANCHOR_TEXT_START.equals(token)) { inAnchorText = true; } else if (ANCHOR_TEXT_END.equals(token)) { inAnchorText = false; } else if (isWord(token)) { numTokens++; numWords++; numWordsCurrentLine++; if (inAnchorText) { numLinkedWords++; } final int tokenLength = token.length(); 
currentLineLength += tokenLength + 1; if (currentLineLength > maxLineLength) { numWrappedLines++; currentLineLength = tokenLength; numWordsCurrentLine = 1; } } else { numTokens++; } } if (numTokens == 0) { return; } int numWordsInWrappedLines; if (numWrappedLines == 0) { numWordsInWrappedLines = numWords; numWrappedLines = 1; } else { numWordsInWrappedLines = numWords - numWordsCurrentLine; } TextBlock tb = new TextBlock(textBuffer.toString().trim(), currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, offsetBlocks); currentContainedTextElements = new BitSet(); offsetBlocks++; textBuffer.setLength(0); tokenBuffer.setLength(0); tb.setTagLevel(blockTagLevel); addTextBlock(tb); blockTagLevel = -1; } protected void addTextBlock(final TextBlock tb) { for (Integer l : fontSizeStack) { if (l != null) { tb.addLabel("font-" + l); break; } } for (LinkedList labelStack : labelStacks) { if (labelStack != null) { for (LabelAction labels : labelStack) { if (labels != null) { labels.addTo(tb); } } } } textBlocks.add(tb); } private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern .compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]"); private static boolean isWord(final String token) { return PAT_VALID_WORD_CHARACTER.matcher(token).find(); } static private enum Event { START_TAG, END_TAG, CHARACTERS, WHITESPACE } public String getTitle() { return title; } public void setTitle(String s) { if (s == null || s.length() == 0) { return; } title = s; } /** * Returns a {@link TextDocument} containing the extracted {@link TextBlock} * s. NOTE: Only call this after parsing. 
* * @return The {@link TextDocument} */ public TextDocument toTextDocument() { // just to be sure flushBlock(); return new TextDocument(getTitle(), getTextBlocks()); } public void addWhitespaceIfNecessary() { if (!sbLastWasWhitespace) { tokenBuffer.append(' '); textBuffer.append(' '); sbLastWasWhitespace = true; } } public void addLabelAction(final LabelAction la) throws IllegalStateException { LinkedList labelStack = labelStacks.getLast(); if (labelStack == null) { labelStack = new LinkedList(); labelStacks.removeLast(); labelStacks.add(labelStack); } labelStack.add(la); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLParser.java100644 0 0 5043 11502365056 23330 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import org.apache.xerces.parsers.AbstractSAXParser; import org.cyberneko.html.HTMLConfiguration; import de.l3s.boilerpipe.BoilerpipeDocumentSource; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.document.TextDocument; /** * A simple SAX Parser, used by {@link BoilerpipeSAXInput}. The parser uses CyberNeko to parse HTML content. * * @author Christian Kohlschütter */ public class BoilerpipeHTMLParser extends AbstractSAXParser implements BoilerpipeDocumentSource { private BoilerpipeHTMLContentHandler contentHandler; /** * Constructs a {@link BoilerpipeHTMLParser} using a default HTML content handler. 
*/ public BoilerpipeHTMLParser() { this(new BoilerpipeHTMLContentHandler()); } /** * Constructs a {@link BoilerpipeHTMLParser} using the given {@link BoilerpipeHTMLContentHandler}. * * @param contentHandler */ public BoilerpipeHTMLParser(BoilerpipeHTMLContentHandler contentHandler) { super(new HTMLConfiguration()); setContentHandler(contentHandler); } protected BoilerpipeHTMLParser(boolean ignore) { super(new HTMLConfiguration()); } public void setContentHandler(final BoilerpipeHTMLContentHandler contentHandler) { this.contentHandler = contentHandler; super.setContentHandler(contentHandler); } public void setContentHandler(final org.xml.sax.ContentHandler contentHandler) { this.contentHandler = null; super.setContentHandler(contentHandler); } /** * Returns a {@link TextDocument} containing the extracted {@link TextBlock} * s. NOTE: Only call this after {@link #parse(org.xml.sax.InputSource)}. * * @return The {@link TextDocument} */ public TextDocument toTextDocument() { return contentHandler.toTextDocument(); } }boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/BoilerpipeSAXInput.java100644 0 0 4475 11502365056 23072 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package de.l3s.boilerpipe.sax; import java.io.IOException; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import de.l3s.boilerpipe.BoilerpipeInput; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.document.TextDocument; /** * Parses an {@link InputSource} using SAX and returns a {@link TextDocument}. * * @author Christian Kohlschütter */ public final class BoilerpipeSAXInput implements BoilerpipeInput { private final InputSource is; /** * Creates a new instance of {@link BoilerpipeSAXInput} for the given {@link InputSource}. * * @param is * @throws SAXException */ public BoilerpipeSAXInput(final InputSource is) throws SAXException { this.is = is; } /** * Retrieves the {@link TextDocument} using a default HTML parser. */ public TextDocument getTextDocument() throws BoilerpipeProcessingException { return getTextDocument(new BoilerpipeHTMLParser()); } /** * Retrieves the {@link TextDocument} using the given HTML parser. * * @param parser The parser used to transform the input into boilerpipe's internal representation. * @return The retrieved {@link TextDocument} * @throws BoilerpipeProcessingException */ public TextDocument getTextDocument(final BoilerpipeHTMLParser parser) throws BoilerpipeProcessingException { try { parser.parse(is); } catch (IOException e) { throw new BoilerpipeProcessingException(e); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } return parser.toTextDocument(); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java100644 0 0 27760 11502365056 22633 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.labels.LabelAction; /** * Defines an action that is to be performed whenever a particular tag occurs during HTML parsing. * * @author Christian Kohlschütter */ public abstract class CommonTagActions { private CommonTagActions() { } public static final class Chained implements TagAction { private final TagAction t1; private final TagAction t2; public Chained(final TagAction t1, final TagAction t2) { this.t1 = t1; this.t2 = t2; } public boolean start(BoilerpipeHTMLContentHandler instance, String localName, String qName, Attributes atts) throws SAXException { return t1.start(instance, localName, qName, atts) | t2.start(instance, localName, qName, atts); } public boolean end(BoilerpipeHTMLContentHandler instance, String localName, String qName) throws SAXException { return t1.end(instance, localName, qName) | t2.end(instance, localName, qName); } public boolean changesTagLevel() { return t1.changesTagLevel() || t2.changesTagLevel(); } } /** * Marks this tag as "ignorable", i.e. all its inner content is silently skipped. 
*/ public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() { public boolean start(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName, final Attributes atts) { instance.inIgnorableElement++; return true; } public boolean end(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName) { instance.inIgnorableElement--; return true; } public boolean changesTagLevel() { return true; } }; /** * Marks this tag as "anchor" (this should usually only be set for the <A> tag). * Anchor tags may not be nested. * * There is a bug in certain versions of NekoHTML which still allows nested tags. * If boilerpipe encounters such nestings, a SAXException is thrown. */ public static final TagAction TA_ANCHOR_TEXT = new TagAction() { public boolean start(BoilerpipeHTMLContentHandler instance, final String localName, final String qName, final Attributes atts) throws SAXException { if (instance.inAnchor++ > 0) { // as nested A elements are not allowed per specification, we // are probably reaching this branch due to a bug in the XML // parser System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. 
Trying to recover somehow..."); end(instance, localName, qName); } if (instance.inIgnorableElement == 0) { instance.addWhitespaceIfNecessary(); instance.tokenBuffer .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_START); instance.tokenBuffer.append(' '); instance.sbLastWasWhitespace = true; } return false; } public boolean end(BoilerpipeHTMLContentHandler instance, final String localName, final String qName) { if (--instance.inAnchor == 0) { if (instance.inIgnorableElement == 0) { instance.addWhitespaceIfNecessary(); instance.tokenBuffer .append(BoilerpipeHTMLContentHandler.ANCHOR_TEXT_END); instance.tokenBuffer.append(' '); instance.sbLastWasWhitespace = true; } } return false; } public boolean changesTagLevel() { return true; } }; /** * Marks this tag the body element (this should usually only be set for the <BODY> tag). */ public static final TagAction TA_BODY = new TagAction() { public boolean start(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName, final Attributes atts) { instance.flushBlock(); instance.inBody++; return false; } public boolean end(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName) { instance.flushBlock(); instance.inBody--; return false; } public boolean changesTagLevel() { return true; } }; /** * Marks this tag a simple "inline" element, which generates whitespace, but no new block. 
*/
    public static final TagAction TA_INLINE_WHITESPACE = new TagAction() {

        /** Asks the handler to add whitespace if necessary; returns {@code false}. */
        public boolean start(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName,
                final Attributes atts) {
            instance.addWhitespaceIfNecessary();
            return false;
        }

        /** Asks the handler to add whitespace if necessary; returns {@code false}. */
        public boolean end(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName) {
            instance.addWhitespaceIfNecessary();
            return false;
        }

        public boolean changesTagLevel() {
            return false;
        }
    };

    /**
     * Alias of {@link #TA_INLINE_WHITESPACE} (same object).
     *
     * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead
     */
    @Deprecated
    public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE;

    /**
     * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
     * All three hooks do nothing and return {@code false}.
     */
    public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {

        public boolean start(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName,
                final Attributes atts) {
            return false;
        }

        public boolean end(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName) {
            return false;
        }

        public boolean changesTagLevel() {
            return false;
        }
    };

    /**
     * Matches a font size value: an optional sign (group 1) followed by exactly
     * one digit (group 2).
     */
    private static final Pattern PAT_FONT_SIZE = Pattern
            .compile("([\\+\\-]?)([0-9])");

    /**
     * Explicitly marks this tag a simple "block-level" element, which always generates whitespace.
     * All three hooks do nothing else and return {@code true}.
     */
    public static final TagAction TA_BLOCK_LEVEL = new TagAction() {

        public boolean start(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName,
                final Attributes atts) {
            return true;
        }

        public boolean end(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName) {
            return true;
        }

        public boolean changesTagLevel() {
            return true;
        }
    };

    /**
     * Special TagAction for the {@code <FONT>} tag, which keeps track of the
     * absolute and relative font size.
*/ public static final TagAction TA_FONT = new TagAction() { public boolean start(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName, final Attributes atts) { String sizeAttr = atts.getValue("size"); if (sizeAttr != null) { Matcher m = PAT_FONT_SIZE.matcher(sizeAttr); if (m.matches()) { String rel = m.group(1); final int val = Integer.parseInt(m.group(2)); final int size; if (rel.length() == 0) { // absolute size = val; } else { // relative int prevSize; if (instance.fontSizeStack.isEmpty()) { prevSize = 3; } else { prevSize = 3; for (Integer s : instance.fontSizeStack) { if (s != null) { prevSize = s; break; } } } if (rel.charAt(0) == '+') { size = prevSize + val; } else { size = prevSize - val; } } instance.fontSizeStack.add(0, size); } else { instance.fontSizeStack.add(0, null); } } else { instance.fontSizeStack.add(0, null); } return false; } public boolean end(final BoilerpipeHTMLContentHandler instance, final String localName, final String qName) { instance.fontSizeStack.removeFirst(); return false; } public boolean changesTagLevel() { return false; } }; /** * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated * {@link TextBlock}. */ public static final class InlineTagLabelAction implements TagAction { private final LabelAction action; public InlineTagLabelAction(final LabelAction action) { this.action = action; } public boolean start(BoilerpipeHTMLContentHandler instance, final String localName, final String qName, final Attributes atts) { instance.addWhitespaceIfNecessary(); instance.addLabelAction(action); return false; } public boolean end(BoilerpipeHTMLContentHandler instance, final String localName, final String qName) { instance.addWhitespaceIfNecessary(); return false; } public boolean changesTagLevel() { return false; } } /** * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated * {@link TextBlock}. 
*/
    public static final class BlockTagLabelAction implements TagAction {

        /** The label action registered with the handler on every start tag. */
        private final LabelAction action;

        /**
         * @param action the {@link LabelAction} to register when the tag starts
         */
        public BlockTagLabelAction(final LabelAction action) {
            this.action = action;
        }

        /** Registers the label action with the handler and returns {@code true}. */
        public boolean start(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName,
                final Attributes atts) {
            instance.addLabelAction(action);
            return true;
        }

        /** Does nothing besides returning {@code true}. */
        public boolean end(BoilerpipeHTMLContentHandler instance,
                final String localName, final String qName) {
            return true;
        }

        public boolean changesTagLevel() {
            return true;
        }
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/DefaultTagActionMap.java100644 0 0 5624 11502365056 23215 0ustar 0 0
/**
 * boilerpipe
 *
 * Copyright (c) 2009, 2010 Christian Kohlschütter
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.sax;

/**
 * Default {@link TagAction}s. Seem to work well.
*
 * @see TagActionMap
 */
public class DefaultTagActionMap extends TagActionMap {
    private static final long serialVersionUID = 1L;

    /** Shared default instance. */
    public static final TagActionMap INSTANCE = new DefaultTagActionMap();

    /**
     * Registers the default actions. Tags are registered in the same order as
     * they have always been; the tag lists are locals (not static fields)
     * because this constructor runs from the static initializer of {@link #INSTANCE}.
     */
    protected DefaultTagActionMap() {
        final String[] ignorableTags = {
                "STYLE", "SCRIPT", "OPTION", "OBJECT", "EMBED", "APPLET", "LINK" };
        for (final String tag : ignorableTags) {
            setTagAction(tag, CommonTagActions.TA_IGNORABLE_ELEMENT);
        }

        setTagAction("A", CommonTagActions.TA_ANCHOR_TEXT);
        setTagAction("BODY", CommonTagActions.TA_BODY);

        final String[] inlineTagsWithoutWhitespace = {
                "STRIKE", "U", "B", "I", "EM", "STRONG", "SPAN",
                // New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
                "SUP",
                // New in 1.2
                "CODE", "TT", "SUB", "VAR" };
        for (final String tag : inlineTagsWithoutWhitespace) {
            setTagAction(tag, CommonTagActions.TA_INLINE_NO_WHITESPACE);
        }

        setTagAction("ABBR", CommonTagActions.TA_INLINE_WHITESPACE);
        setTagAction("ACRONYM", CommonTagActions.TA_INLINE_WHITESPACE);

        setTagAction("FONT", CommonTagActions.TA_INLINE_NO_WHITESPACE); // could also use TA_FONT

        // added in 1.1.1
        setTagAction("NOSCRIPT", CommonTagActions.TA_IGNORABLE_ELEMENT);
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/HTMLDocument.java100644 0 0 1555 11502365056 21643 0ustar 0 0
package de.l3s.boilerpipe.sax;

import java.io.ByteArrayInputStream;
import java.nio.charset.Charset;

import org.xml.sax.InputSource;

/**
 * An {@link InputSourceable} for {@link HTMLFetcher}: raw document bytes
 * together with the charset they are to be decoded with.
 *
 * @author Christian Kohlschütter
 */
public class HTMLDocument implements InputSourceable {
    /** Charset used when a document is supplied as a {@link String}. */
    private static final Charset STRING_CHARSET = Charset.forName("utf-8");

    private final Charset charset;
    private final byte[] data;

    /**
     * @param data the raw document bytes (stored by reference, not copied)
     * @param charset the charset reported through {@link #toInputSource()}
     */
    public HTMLDocument(final byte[] data, final Charset charset) {
        this.data = data;
        this.charset = charset;
    }

    /**
     * Creates a document from text; the text is encoded as UTF-8.
     *
     * @param data the document text
     */
    public HTMLDocument(final String data) {
        this(data.getBytes(STRING_CHARSET), STRING_CHARSET);
    }

    public Charset getCharset() {
        return charset;
    }

    /** Returns the internal byte array itself (no copy is made). */
    public byte[] getData() {
        return data;
    }

    /**
     * Creates a new {@link InputSource} over the stored bytes, with its
     * encoding set to the name of this document's charset.
     */
    public InputSource toInputSource() {
        final InputSource inputSource = new InputSource(new ByteArrayInputStream(data));
        inputSource.setEncoding(charset.name());
        return inputSource;
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/HTMLFetcher.java100644 0 0 3351 11502365056 21441 0ustar 0 0
package de.l3s.boilerpipe.sax;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import
java.util.regex.Pattern; import java.util.zip.GZIPInputStream; /** * A very simple HTTP/HTML fetcher, really just for demo purposes. * * @author Christian Kohlschütter */ public class HTMLFetcher { private HTMLFetcher() { } private static final Pattern PAT_CHARSET = Pattern.compile("charset=([^; ]+)$"); /** * Fetches the document at the given URL, using {@link URLConnection}. * @param url * @return * @throws IOException */ public static HTMLDocument fetch(final URL url) throws IOException { final URLConnection conn = url.openConnection(); final String ct = conn.getContentType(); Charset cs = Charset.forName("Cp1252"); if (ct != null) { Matcher m = PAT_CHARSET.matcher(ct); if(m.find()) { final String charset = m.group(1); try { cs = Charset.forName(charset); } catch (UnsupportedCharsetException e) { // keep default } } } InputStream in = conn.getInputStream(); final String encoding = conn.getContentEncoding(); if(encoding != null) { if("gzip".equalsIgnoreCase(encoding)) { in = new GZIPInputStream(in); } else { System.err.println("WARN: unsupported Content-Encoding: "+encoding); } } ByteArrayOutputStream bos = new ByteArrayOutputStream(); byte[] buf = new byte[4096]; int r; while ((r = in.read(buf)) != -1) { bos.write(buf, 0, r); } in.close(); final byte[] data = bos.toByteArray(); return new HTMLDocument(data, cs); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/HTMLHighlighter.java100644 0 0 31057 11502365056 22343 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.sax;

import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;

/**
 * Highlights text blocks in an HTML document that have been marked as "content"
 * in the corresponding {@link TextDocument}.
 *
 * Instances are obtained through {@link #newHighlightingInstance()} or
 * {@link #newExtractingInstance()}; the constructor is private.
 *
 * @author Christian Kohlschütter
 */
public final class HTMLHighlighter {

    /**
     * Creates a new {@link HTMLHighlighter}, which is set-up to return the full
     * HTML text, with the extracted text portion highlighted.
     */
    public static HTMLHighlighter newHighlightingInstance() {
        return new HTMLHighlighter(false);
    }

    /**
     * Creates a new {@link HTMLHighlighter}, which is set-up to return only the
     * extracted HTML text, including enclosed markup.
     */
    public static HTMLHighlighter newExtractingInstance() {
        return new HTMLHighlighter(true);
    }

    /**
     * @param extractHTML when {@code true}, switches the instance to
     *        "highlighted output only" and clears the pre/post highlight
     *        strings; when {@code false}, all field defaults are kept
     */
    private HTMLHighlighter(final boolean extractHTML) {
        if (extractHTML) {
            setOutputHighlightOnly(true);
            // NOTE(review): this literal is only two newlines here; it looks as if
            // markup inside the string was lost when the source was extracted --
            // confirm against the upstream file.
            setExtraStyleSheet("\n\n");
            setPreHighlight("");
            setPostHighlight("");
        }
    }

    /**
     * Processes the given {@link TextDocument} and the original HTML text (as a
     * String).
     *
     * @param doc
     *            The processed {@link TextDocument}.
     * @param origHTML
     *            The original HTML document.
* @throws BoilerpipeProcessingException */ public String process(final TextDocument doc, final String origHTML) throws BoilerpipeProcessingException { return process(doc, new InputSource(new StringReader(origHTML))); } /** * Processes the given {@link TextDocument} and the original HTML text (as * an {@link InputSource}). * * @param doc * The processed {@link TextDocument}. * @param is * The original HTML document. * @throws BoilerpipeProcessingException */ public String process(final TextDocument doc, final InputSource is) throws BoilerpipeProcessingException { final Implementation implementation = new Implementation(); implementation.process(doc, is); String html = implementation.html.toString(); if(outputHighlightOnly) { Matcher m; boolean repeat = true; while(repeat) { repeat = false; m = PAT_TAG_NO_TEXT.matcher(html); if(m.find()) { repeat = true; html = m.replaceAll(""); } m = PAT_SUPER_TAG.matcher(html); if(m.find()) { repeat = true; html = m.replaceAll(m.group(1)); } } } return html; } private static final Pattern PAT_TAG_NO_TEXT = Pattern.compile("<[^/][^>]*>]*>"); private static final Pattern PAT_SUPER_TAG = Pattern.compile("^<[^>]*>(<.*?>)]*>$"); public String process(final URL url, final BoilerpipeExtractor extractor) throws IOException, BoilerpipeProcessingException, SAXException { final HTMLDocument htmlDoc = HTMLFetcher.fetch(url); final TextDocument doc = new BoilerpipeSAXInput(htmlDoc.toInputSource()) .getTextDocument(); extractor.process(doc); final InputSource is = htmlDoc.toInputSource(); return process(doc, is); } private boolean outputHighlightOnly = false; private String extraStyleSheet = "\n\n"; private String preHighlight = ""; private String postHighlight = ""; /** * If true, only HTML enclosed within highlighted content will be returned */ public boolean isOutputHighlightOnly() { return outputHighlightOnly; } /** * Sets whether only HTML enclosed within highlighted content will be * returned, or the whole HTML document. 
*/
    public void setOutputHighlightOnly(boolean outputHighlightOnly) {
        this.outputHighlightOnly = outputHighlightOnly;
    }

    /**
     * Returns the extra stylesheet definition that will be inserted in the HEAD
     * element.
     *
     * By default, this corresponds to a simple definition that marks text in
     * class "x-boilerpipe-mark1" as inline text with yellow background.
     */
    public String getExtraStyleSheet() {
        return extraStyleSheet;
    }

    /**
     * Sets the extra stylesheet definition that will be inserted in the HEAD
     * element.
     *
     * To disable, set it to the empty string: ""
     *
     * @param extraStyleSheet
     *            Plain HTML
     */
    public void setExtraStyleSheet(String extraStyleSheet) {
        this.extraStyleSheet = extraStyleSheet;
    }

    /**
     * Returns the string that will be inserted before any highlighted HTML
     * block.
     *
     * By default, this corresponds to
     * {@code <span class="x-boilerpipe-mark1">}
     */
    public String getPreHighlight() {
        return preHighlight;
    }

    /**
     * Sets the string that will be inserted prior to any highlighted HTML
     * block.
     *
     * To disable, set it to the empty string: ""
     */
    public void setPreHighlight(String preHighlight) {
        this.preHighlight = preHighlight;
    }

    /**
     * Returns the string that will be inserted after any highlighted HTML
     * block.
     *
     * By default, this corresponds to {@code </span>}
     */
    public String getPostHighlight() {
        return postHighlight;
    }

    /**
     * Sets the string that will be inserted after any highlighted HTML block.
* * To disable, set it to the empty string: "" */ public void setPostHighlight(String postHighlight) { this.postHighlight = postHighlight; } private abstract static class TagAction { void beforeStart(final Implementation instance, final String localName) { } void afterStart(final Implementation instance, final String localName) { } void beforeEnd(final Implementation instance, final String localName) { } void afterEnd(final Implementation instance, final String localName) { } } private static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() { void beforeStart(final Implementation instance, final String localName) { instance.inIgnorableElement++; } void afterEnd(final Implementation instance, final String localName) { instance.inIgnorableElement--; } }; private static final TagAction TA_HEAD = new TagAction() { void beforeStart(final Implementation instance, final String localName) { instance.inIgnorableElement++; } void beforeEnd(final Implementation instance, String localName) { instance.html.append(instance.hl.extraStyleSheet); } void afterEnd(final Implementation instance, final String localName) { instance.inIgnorableElement--; } }; private static Map TAG_ACTIONS = new HashMap(); static { TAG_ACTIONS.put("STYLE", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("SCRIPT", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("OPTION", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("NOSCRIPT", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("OBJECT", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("EMBED", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("APPLET", TA_IGNORABLE_ELEMENT); // NOTE: you might want to comment this out: TAG_ACTIONS.put("LINK", TA_IGNORABLE_ELEMENT); TAG_ACTIONS.put("HEAD", TA_HEAD); } private final class Implementation extends AbstractSAXParser implements ContentHandler { StringBuilder html = new StringBuilder(); private int inIgnorableElement = 0; private int characterElementIdx = 0; private final BitSet contentBitSet = new BitSet(); private final HTMLHighlighter hl = 
HTMLHighlighter.this; Implementation() { super(new HTMLConfiguration()); setContentHandler(this); } void process(final TextDocument doc, final InputSource is) throws BoilerpipeProcessingException { for (TextBlock block : doc.getTextBlocks()) { if (block.isContent()) { final BitSet bs = block.getContainedTextElements(); if (bs != null) { contentBitSet.or(bs); } } } try { parse(is); } catch (SAXException e) { throw new BoilerpipeProcessingException(e); } catch (IOException e) { throw new BoilerpipeProcessingException(e); } } public void endDocument() throws SAXException { } public void endPrefixMapping(String prefix) throws SAXException { } public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { } public void processingInstruction(String target, String data) throws SAXException { } public void setDocumentLocator(Locator locator) { } public void skippedEntity(String name) throws SAXException { } public void startDocument() throws SAXException { } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { TagAction ta = TAG_ACTIONS.get(localName); if (ta != null) { ta.beforeStart(this, localName); } // HACK: remove existing highlight boolean ignoreAttrs = false; if ("SPAN".equalsIgnoreCase(localName)) { String classVal = atts.getValue("class"); if ("x-boilerpipe-mark1".equals(classVal)) { ignoreAttrs = true; } } try { if (inIgnorableElement == 0) { if (outputHighlightOnly) { // boolean highlight = contentBitSet // .get(characterElementIdx); // if (!highlight) { // return; // } } html.append('<'); html.append(qName); if (!ignoreAttrs) { final int numAtts = atts.getLength(); for (int i = 0; i < numAtts; i++) { final String attr = atts.getQName(i); final String value = atts.getValue(i); html.append(' '); html.append(attr); html.append("=\""); html.append(xmlEncode(value)); html.append("\""); } } html.append('>'); } } finally { if (ta != null) { ta.afterStart(this, localName); } } } public 
void endElement(String uri, String localName, String qName) throws SAXException { TagAction ta = TAG_ACTIONS.get(localName); if (ta != null) { ta.beforeEnd(this, localName); } try { if (inIgnorableElement == 0) { if (outputHighlightOnly) { // boolean highlight = contentBitSet // .get(characterElementIdx); // if (!highlight) { // return; // } } html.append("'); } } finally { if (ta != null) { ta.afterEnd(this, localName); } } } public void characters(char[] ch, int start, int length) throws SAXException { characterElementIdx++; if (inIgnorableElement == 0) { boolean highlight = contentBitSet.get(characterElementIdx); if (!highlight && outputHighlightOnly) { return; } if (highlight) { html.append(preHighlight); } html.append(xmlEncode(String.valueOf(ch, start, length))); if (highlight) { html.append(postHighlight); } } } public void startPrefixMapping(String prefix, String uri) throws SAXException { } } private static String xmlEncode(final String in) { if (in == null) { return ""; } char c; StringBuilder out = new StringBuilder(in.length()); for (int i = 0; i < in.length(); i++) { c = in.charAt(i); switch (c) { case '<': out.append("<"); break; case '>': out.append(">"); break; case '&': out.append("&"); break; case '"': out.append("""); break; default: out.append(c); } } return out.toString(); } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/InputSourceable.java100644 0 0 432 11502365056 22455 0ustar 0 0 package de.l3s.boilerpipe.sax; import org.xml.sax.InputSource; /** * An InputSourceable can return an arbitrary number of new {@link InputSource}s for a given document. 
* * @author Christian Kohlschütter */ public interface InputSourceable { InputSource toInputSource(); } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java100644 0 0 5507 11502365056 22432 0ustar 0 0 package de.l3s.boilerpipe.sax; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import de.l3s.boilerpipe.document.TextBlock; import de.l3s.boilerpipe.labels.DefaultLabels; import de.l3s.boilerpipe.labels.LabelAction; /** * Assigns labels for element CSS classes and ids to the corresponding * {@link TextBlock}. CSS classes are prefixed by * {@link DefaultLabels#MARKUP_PREFIX}., and IDs are prefixed by * {@link DefaultLabels#MARKUP_PREFIX}# * * @author Christian Kohlschütter */ public final class MarkupTagAction implements TagAction { private final boolean isBlockLevel; private LinkedList> labelStack = new LinkedList>(); public MarkupTagAction(final boolean isBlockLevel) { this.isBlockLevel = isBlockLevel; } private static final Pattern PAT_NUM = Pattern.compile("[0-9]+"); @Override public boolean start(BoilerpipeHTMLContentHandler instance, String localName, String qName, Attributes atts) throws SAXException { List labels = new ArrayList(5); labels.add(DefaultLabels.MARKUP_PREFIX + localName); String classVal = atts.getValue("class"); if (classVal != null && classVal.length() > 0) { classVal = PAT_NUM.matcher(classVal).replaceAll("#"); classVal = classVal.trim(); String[] vals = classVal.split("[ ]+"); labels.add(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')); if (vals.length > 1) { for (String s : vals) { labels.add(DefaultLabels.MARKUP_PREFIX + "." 
+ s); } } } String id = atts.getValue("id"); if (id != null && id.length() > 0) { id = PAT_NUM.matcher(id).replaceAll("#"); labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id); } Set ancestors = getAncestorLabels(); List labelsWithAncestors = new ArrayList( (ancestors.size() + 1) * labels.size()); for (String l : labels) { for (String an : ancestors) { labelsWithAncestors.add(an); labelsWithAncestors.add(an + " " + l); } labelsWithAncestors.add(l); } instance.addLabelAction(new LabelAction(labelsWithAncestors .toArray(new String[labelsWithAncestors.size()]))); labelStack.add(labels); return isBlockLevel; } @Override public boolean end(BoilerpipeHTMLContentHandler instance, String localName, String qName) throws SAXException { labelStack.removeLast(); return isBlockLevel; } public boolean changesTagLevel() { return isBlockLevel; } private Set getAncestorLabels() { Set set = new HashSet(); for (List labels : labelStack) { if (labels == null) { continue; } set.addAll(labels); } return set; } } boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/TagAction.java100644 0 0 2344 11502365056 21246 0ustar 0 0 /** * boilerpipe * * Copyright (c) 2009, 2010 Christian Kohlschütter * * The author licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.l3s.boilerpipe.sax; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * Defines an action that is to be performed whenever a particular tag occurs * during HTML parsing. 
*
 * @author Christian Kohlschütter
 */
public interface TagAction {

    /** Called when the start tag is encountered. */
    boolean start(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName, final Attributes atts)
            throws SAXException;

    /** Called when the end tag is encountered. */
    boolean end(final BoilerpipeHTMLContentHandler instance,
            final String localName, final String qName) throws SAXException;

    boolean changesTagLevel();
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/TagActionMap.java100644 0 0 4230 11502365056 21700 0ustar 0 0
/**
 * boilerpipe
 *
 * Copyright (c) 2009, 2010 Christian Kohlschütter
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.sax;

import java.util.HashMap;

/**
 * Base class for defining a set of {@link TagAction}s that are to be used for the
 * HTML parsing process. Maps tag names to their {@link TagAction}.
 *
 * @see DefaultTagActionMap
 * @author Christian Kohlschütter
 */
public abstract class TagActionMap extends HashMap<String, TagAction> {
    private static final long serialVersionUID = 1L;

    /**
     * Sets a particular {@link TagAction} for a given tag. Any existing TagAction for that tag
     * will be removed and overwritten.
     *
     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
     * @param action The {@link TagAction}
     */
    protected void setTagAction(final String tag, final TagAction action) {
        // Case conversion uses a fixed locale: with the default locale, tag names
        // containing 'i' would be stored under unexpected keys on e.g. Turkish systems.
        put(tag.toUpperCase(java.util.Locale.ENGLISH), action);
        put(tag.toLowerCase(java.util.Locale.ENGLISH), action);
        put(tag, action);
    }

    /**
     * Adds a particular {@link TagAction} for a given tag. If a TagAction already exists for that tag,
     * a chained action, consisting of the previous and the new {@link TagAction} is created.
     *
     * @param tag The tag (will be stored internally 1. as it is, 2. lower-case, 3. upper-case)
     * @param action The {@link TagAction}
     */
    protected void addTagAction(final String tag, final TagAction action) {
        final TagAction previousAction = get(tag);
        if (previousAction == null) {
            setTagAction(tag, action);
        } else {
            setTagAction(tag, new CommonTagActions.Chained(previousAction, action));
        }
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/sax/package.html100644 0 0 165 11502365056 20772 0ustar 0 0

Classes related to parsing and producing HTML from/to Boilerpipe TextDocuments.

boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/util/UnicodeTokenizer.java100644 0 0 3027 11502365056 23037 0ustar 0 0
/**
 * boilerpipe
 *
 * Copyright (c) 2009 Christian Kohlschütter
 *
 * The author licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.l3s.boilerpipe.util;

import java.util.regex.Pattern;

/**
 * Tokenizes text according to Unicode word boundaries and strips off non-word
 * characters.
 *
 * @author Christian Kohlschütter
 */
public class UnicodeTokenizer {
    /** Every regex word boundary; each one is marked with U+2063 in a first pass. */
    private static final Pattern PAT_WORD_BOUNDARY = Pattern.compile("\\b");

    /** A listed punctuation character together with the boundary markers around it. */
    private static final Pattern PAT_NOT_WORD_BOUNDARY = Pattern
            .compile("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*");

    /** A run of spaces and/or boundary markers; collapsed to a single space. */
    private static final Pattern PAT_SEPARATOR_RUN = Pattern.compile("[ \u2063]+");

    /** A run of spaces; used to split the normalized text into tokens. */
    private static final Pattern PAT_SPACES = Pattern.compile("[ ]+");

    /**
     * Tokenizes the text and returns an array of tokens.
     *
     * Word boundaries are marked first; markers next to the listed punctuation
     * characters are removed again, so such punctuation stays attached to its
     * neighbours (e.g. "foo-bar" remains one token). The remaining markers and
     * spaces separate the tokens. For empty input the result is an array
     * holding a single empty string.
     *
     * @param text The text
     * @return The tokens
     */
    public static String[] tokenize(final CharSequence text) {
        // All four patterns are precompiled; previously the last two were
        // recompiled on every call via String.replaceAll / String.split.
        final String marked = PAT_WORD_BOUNDARY.matcher(text).replaceAll("\u2063");
        final String joined = PAT_NOT_WORD_BOUNDARY.matcher(marked).replaceAll("$1");
        final String normalized = PAT_SEPARATOR_RUN.matcher(joined).replaceAll(" ").trim();
        return PAT_SPACES.split(normalized);
    }
}
boilerpipe-1.2.0/src/main/de/l3s/boilerpipe/util/package.html100644 0 0 72 11502365056 21131 0ustar 0 0

Some helper classes.

boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java100644 0 0 74722 11502365056 21365 0ustar 0 0 /* * Copyright 2002-2009 Andy Clark, Marc Guillemot * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cyberneko.html; /** * Collection of HTML element information. * * @author Andy Clark * @author Ahmed Ashour * @author Marc Guillemot * * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $ */ public class HTMLElements { // // Constants // // element codes // NOTE: The element codes *must* start with 0 and increment in // sequence. The parent and closes references depends on // this assumption. 
-Ac public static final short A = 0; public static final short ABBR = A+1; public static final short ACRONYM = ABBR+1; public static final short ADDRESS = ACRONYM+1; public static final short APPLET = ADDRESS+1; public static final short AREA = APPLET+1; public static final short B = AREA+1; public static final short BASE = B+1; public static final short BASEFONT = BASE+1; public static final short BDO = BASEFONT+1; public static final short BGSOUND = BDO+1; public static final short BIG = BGSOUND+1; public static final short BLINK = BIG+1; public static final short BLOCKQUOTE = BLINK+1; public static final short BODY = BLOCKQUOTE+1; public static final short BR = BODY+1; public static final short BUTTON = BR+1; public static final short CAPTION = BUTTON+1; public static final short CENTER = CAPTION+1; public static final short CITE = CENTER+1; public static final short CODE = CITE+1; public static final short COL = CODE+1; public static final short COLGROUP = COL+1; public static final short COMMENT = COLGROUP+1; public static final short DEL = COMMENT+1; public static final short DFN = DEL+1; public static final short DIR = DFN+1; public static final short DIV = DIR+1; public static final short DD = DIV+1; public static final short DL = DD+1; public static final short DT = DL+1; public static final short EM = DT+1; public static final short EMBED = EM+1; public static final short FIELDSET = EMBED+1; public static final short FONT = FIELDSET+1; public static final short FORM = FONT+1; public static final short FRAME = FORM+1; public static final short FRAMESET = FRAME+1; public static final short H1 = FRAMESET+1; public static final short H2 = H1+1; public static final short H3 = H2+1; public static final short H4 = H3+1; public static final short H5 = H4+1; public static final short H6 = H5+1; public static final short HEAD = H6+1; public static final short HR = HEAD+1; public static final short HTML = HR+1; public static final short I = HTML+1; public static 
final short IFRAME = I+1; public static final short ILAYER = IFRAME+1; public static final short IMG = ILAYER+1; public static final short INPUT = IMG+1; public static final short INS = INPUT+1; public static final short ISINDEX = INS+1; public static final short KBD = ISINDEX+1; public static final short KEYGEN = KBD+1; public static final short LABEL = KEYGEN+1; public static final short LAYER = LABEL+1; public static final short LEGEND = LAYER+1; public static final short LI = LEGEND+1; public static final short LINK = LI+1; public static final short LISTING = LINK+1; public static final short MAP = LISTING+1; public static final short MARQUEE = MAP+1; public static final short MENU = MARQUEE+1; public static final short META = MENU+1; public static final short MULTICOL = META+1; public static final short NEXTID = MULTICOL+1; public static final short NOBR = NEXTID+1; public static final short NOEMBED = NOBR+1; public static final short NOFRAMES = NOEMBED+1; public static final short NOLAYER = NOFRAMES+1; public static final short NOSCRIPT = NOLAYER+1; public static final short OBJECT = NOSCRIPT+1; public static final short OL = OBJECT+1; public static final short OPTION = OL+1; public static final short OPTGROUP = OPTION+1; public static final short P = OPTGROUP+1; public static final short PARAM = P+1; public static final short PLAINTEXT = PARAM+1; public static final short PRE = PLAINTEXT+1; public static final short Q = PRE+1; public static final short RB = Q+1; public static final short RBC = RB+1; public static final short RP = RBC+1; public static final short RT = RP+1; public static final short RTC = RT+1; public static final short RUBY = RTC+1; public static final short S = RUBY+1; public static final short SAMP = S+1; public static final short SCRIPT = SAMP+1; public static final short SELECT = SCRIPT+1; public static final short SMALL = SELECT+1; public static final short SOUND = SMALL+1; public static final short SPACER = SOUND+1; public static final 
short SPAN = SPACER+1; public static final short STRIKE = SPAN+1; public static final short STRONG = STRIKE+1; public static final short STYLE = STRONG+1; public static final short SUB = STYLE+1; public static final short SUP = SUB+1; public static final short TABLE = SUP+1; public static final short TBODY = TABLE+1; public static final short TD = TBODY+1; public static final short TEXTAREA = TD+1; public static final short TFOOT = TEXTAREA+1; public static final short TH = TFOOT+1; public static final short THEAD = TH+1; public static final short TITLE = THEAD+1; public static final short TR = TITLE+1; public static final short TT = TR+1; public static final short U = TT+1; public static final short UL = U+1; public static final short VAR = UL+1; public static final short WBR = VAR+1; public static final short XML = WBR+1; public static final short XMP = XML+1; public static final short UNKNOWN = XMP+1; // information /** Element information organized by first letter. */ protected static final Element[][] ELEMENTS_ARRAY = new Element[26][]; /** Element information as a contiguous list. */ protected static final ElementList ELEMENTS = new ElementList(); /** No such element. */ public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null); // // Static initializer // /** * Initializes the element information. *

* Note: * The getElement method requires that the HTML elements * are added to the list in alphabetical order. If new elements are * added, then they must be inserted in alphabetical order. */ static { // // // // // // // // // initialize array of element information ELEMENTS_ARRAY['A'-'A'] = new Element[] { // A - - (%inline;)* -(A) new Element(A, "A", Element.INLINE, BODY, new short[] {A}), // ABBR - - (%inline;)* new Element(ABBR, "ABBR", Element.INLINE, BODY, null), // ACRONYM - - (%inline;)* new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null), // ADDRESS - - (%inline;)* new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null), // APPLET new Element(APPLET, "APPLET", 0, BODY, null), // AREA - O EMPTY new Element(AREA, "AREA", Element.EMPTY, MAP, null), }; ELEMENTS_ARRAY['B'-'A'] = new Element[] { // B - - (%inline;)* new Element(B, "B", Element.INLINE, BODY, null), // BASE - O EMPTY new Element(BASE, "BASE", Element.EMPTY, HEAD, null), // BASEFONT new Element(BASEFONT, "BASEFONT", 0, HEAD, null), // BDO - - (%inline;)* new Element(BDO, "BDO", Element.INLINE, BODY, null), // BGSOUND new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null), // BIG - - (%inline;)* new Element(BIG, "BIG", Element.INLINE, BODY, null), // BLINK new Element(BLINK, "BLINK", Element.INLINE, BODY, null), // BLOCKQUOTE - - (%block;|SCRIPT)+ new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}), // BODY O O (%block;|SCRIPT)+ +(INS|DEL) new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}), // BR - O EMPTY new Element(BR, "BR", Element.EMPTY, BODY, null), // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET) new Element(BUTTON, "BUTTON", 0, BODY, null), }; ELEMENTS_ARRAY['C'-'A'] = new Element[] { // CAPTION - - (%inline;)* new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null), // CENTER, new Element(CENTER, "CENTER", 0, BODY, null), // CITE - - (%inline;)* new Element(CITE, "CITE", Element.INLINE, BODY, null), // CODE - - (%inline;)* 
new Element(CODE, "CODE", Element.INLINE, BODY, null), // COL - O EMPTY new Element(COL, "COL", Element.EMPTY, TABLE, null), // COLGROUP - O (COL)* new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}), // COMMENT new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null), }; ELEMENTS_ARRAY['D'-'A'] = new Element[] { // DEL - - (%flow;)* new Element(DEL, "DEL", 0, BODY, null), // DFN - - (%inline;)* new Element(DFN, "DFN", Element.INLINE, BODY, null), // DIR new Element(DIR, "DIR", 0, BODY, null), // DIV - - (%flow;)* new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}), // DD - O (%flow;)* new Element(DD, "DD", 0, DL, new short[]{DT,DD}), // DL - - (DT|DD)+ new Element(DL, "DL", Element.BLOCK, BODY, null), // DT - O (%inline;)* new Element(DT, "DT", 0, DL, new short[]{DT,DD}), }; ELEMENTS_ARRAY['E'-'A'] = new Element[] { // EM - - (%inline;)* new Element(EM, "EM", Element.INLINE, BODY, null), // EMBED new Element(EMBED, "EMBED", 0, BODY, null), }; ELEMENTS_ARRAY['F'-'A'] = new Element[] { // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*) new Element(FIELDSET, "FIELDSET", 0, BODY, null), // FONT new Element(FONT, "FONT", Element.CONTAINER, BODY, null), // FORM - - (%block;|SCRIPT)+ -(FORM) new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}), // FRAME - O EMPTY new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null), // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?) 
new Element(FRAMESET, "FRAMESET", 0, HTML, null), }; ELEMENTS_ARRAY['H'-'A'] = new Element[] { // (H1|H2|H3|H4|H5|H6) - - (%inline;)* new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), // HEAD O O (%head.content;) +(%head.misc;) new Element(HEAD, "HEAD", 0, HTML, null), // HR - O EMPTY new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}), // HTML O O (%html.content;) new Element(HTML, "HTML", 0, null, null), }; ELEMENTS_ARRAY['I'-'A'] = new Element[] { // I - - (%inline;)* new Element(I, "I", Element.INLINE, BODY, null), // IFRAME new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null), // ILAYER new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null), // IMG - O EMPTY new Element(IMG, "IMG", Element.EMPTY, BODY, null), // INPUT - O EMPTY new Element(INPUT, "INPUT", Element.EMPTY, BODY, null), // INS - - (%flow;)* new Element(INS, "INS", 0, BODY, null), // ISINDEX new Element(ISINDEX, "ISINDEX", 0, HEAD, null), }; ELEMENTS_ARRAY['K'-'A'] = new Element[] { // KBD - - (%inline;)* new Element(KBD, "KBD", Element.INLINE, BODY, null), // KEYGEN new Element(KEYGEN, "KEYGEN", 0, BODY, null), }; ELEMENTS_ARRAY['L'-'A'] = new Element[] { // LABEL - - (%inline;)* -(LABEL) new Element(LABEL, "LABEL", 0, BODY, null), // LAYER new Element(LAYER, "LAYER", Element.BLOCK, BODY, null), // LEGEND - - (%inline;)* new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null), // LI - O (%flow;)* new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}), // LINK - O EMPTY 
new Element(LINK, "LINK", Element.EMPTY, HEAD, null), // LISTING new Element(LISTING, "LISTING", 0, BODY, null), }; ELEMENTS_ARRAY['M'-'A'] = new Element[] { // MAP - - ((%block;) | AREA)+ new Element(MAP, "MAP", Element.INLINE, BODY, null), // MARQUEE new Element(MARQUEE, "MARQUEE", 0, BODY, null), // MENU new Element(MENU, "MENU", 0, BODY, null), // META - O EMPTY new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}), // MULTICOL new Element(MULTICOL, "MULTICOL", 0, BODY, null), }; ELEMENTS_ARRAY['N'-'A'] = new Element[] { // NEXTID new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null), // NOBR new Element(NOBR, "NOBR", Element.INLINE, BODY, null), // NOEMBED new Element(NOEMBED, "NOEMBED", 0, BODY, null), // NOFRAMES - - (BODY) -(NOFRAMES) new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null), // NOLAYER new Element(NOLAYER, "NOLAYER", 0, BODY, null), // NOSCRIPT - - (%block;)+ new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null), }; ELEMENTS_ARRAY['O'-'A'] = new Element[] { // OBJECT - - (PARAM | %flow;)* new Element(OBJECT, "OBJECT", 0, BODY, null), // OL - - (LI)+ new Element(OL, "OL", Element.BLOCK, BODY, null), // OPTGROUP - - (OPTION)+ new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}), // OPTION - O (#PCDATA) new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}), }; ELEMENTS_ARRAY['P'-'A'] = new Element[] { // P - O (%inline;)* new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}), // PARAM - O EMPTY new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null), // PLAINTEXT new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null), // PRE - - (%inline;)* -(%pre.exclusion;) new Element(PRE, "PRE", 0, BODY, null), }; ELEMENTS_ARRAY['Q'-'A'] = new Element[] { // Q - - (%inline;)* new Element(Q, "Q", Element.INLINE, BODY, null), }; ELEMENTS_ARRAY['R'-'A'] = new Element[] { // RB new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}), // RBC new Element(RBC, "RBC", 0, 
RUBY, null), // RP new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}), // RT new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}), // RTC new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}), // RUBY new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}), }; ELEMENTS_ARRAY['S'-'A'] = new Element[] { // S new Element(S, "S", 0, BODY, null), // SAMP - - (%inline;)* new Element(SAMP, "SAMP", Element.INLINE, BODY, null), // SCRIPT - - %Script; new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null), // SELECT - - (OPTGROUP|OPTION)+ new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}), // SMALL - - (%inline;)* new Element(SMALL, "SMALL", Element.INLINE, BODY, null), // SOUND new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null), // SPACER new Element(SPACER, "SPACER", Element.EMPTY, BODY, null), // SPAN - - (%inline;)* new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null), // STRIKE new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null), // STRONG - - (%inline;)* new Element(STRONG, "STRONG", Element.INLINE, BODY, null), // STYLE - - %StyleSheet; new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}), // SUB - - (%inline;)* new Element(SUB, "SUB", Element.INLINE, BODY, null), // SUP - - (%inline;)* new Element(SUP, "SUP", Element.INLINE, BODY, null), }; ELEMENTS_ARRAY['T'-'A'] = new Element[] { // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+) new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null), // TBODY O O (TR)+ new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}), // TD - O (%flow;)* new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), // TEXTAREA - - (#PCDATA) new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null), // TFOOT - O (TR)+ new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}), // TH - O (%flow;)* new Element(TH, "TH", Element.CONTAINER, 
TR, TABLE, new short[]{TD,TH}), // THEAD - O (TR)+ new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}), // TITLE - - (#PCDATA) -(%head.misc;) new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null), // TR - O (TH|TD)+ new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}), // TT - - (%inline;)* new Element(TT, "TT", Element.INLINE, BODY, null), }; ELEMENTS_ARRAY['U'-'A'] = new Element[] { // U, new Element(U, "U", Element.INLINE, BODY, null), // UL - - (LI)+ new Element(UL, "UL", Element.BLOCK, BODY, null), }; ELEMENTS_ARRAY['V'-'A'] = new Element[] { // VAR - - (%inline;)* new Element(VAR, "VAR", Element.INLINE, BODY, null), }; ELEMENTS_ARRAY['W'-'A'] = new Element[] { // WBR new Element(WBR, "WBR", Element.EMPTY, BODY, null), }; ELEMENTS_ARRAY['X'-'A'] = new Element[] { // XML new Element(XML, "XML", 0, BODY, null), // XMP new Element(XMP, "XMP", Element.SPECIAL, BODY, null), }; // keep contiguous list of elements for lookups by code for (int i = 0; i < ELEMENTS_ARRAY.length; i++) { Element[] elements = ELEMENTS_ARRAY[i]; if (elements != null) { for (int j = 0; j < elements.length; j++) { Element element = elements[j]; ELEMENTS.addElement(element); } } } ELEMENTS.addElement(NO_SUCH_ELEMENT); // initialize cross references to parent elements for (int i = 0; i < ELEMENTS.size; i++) { Element element = ELEMENTS.data[i]; if (element.parentCodes != null) { element.parent = new Element[element.parentCodes.length]; for (int j = 0; j < element.parentCodes.length; j++) { element.parent[j] = ELEMENTS.data[element.parentCodes[j]]; } element.parentCodes = null; } } } // () // // Public static methods // /** * Returns the element information for the specified element code. * * @param code The element code. */ public static final Element getElement(short code) { return ELEMENTS.data[code]; } // getElement(short):Element /** * Returns the element information for the specified element name. 
* * @param ename The element name. */ public static final Element getElement(String ename) { return getElement(ename, NO_SUCH_ELEMENT); } // getElement(String):Element /** * Returns the element information for the specified element name. * * @param ename The element name. * @param element The default element to return if not found. */ public static final Element getElement(String ename, Element element) { if (ename.length() > 0) { int c = ename.charAt(0); if (c >= 'a' && c <= 'z') { c = 'A' + c - 'a'; } if (c >= 'A' && c <= 'Z') { Element[] elements = ELEMENTS_ARRAY[c - 'A']; if (elements != null) { for (int i = 0; i < elements.length; i++) { Element elem = elements[i]; if (elem.name.equalsIgnoreCase(ename)) { return elem; } } } } } return element; } // getElement(String):Element // // Classes // /** * Element information. * * @author Andy Clark */ public static class Element { // // Constants // /** Inline element. */ public static final int INLINE = 0x01; /** Block element. */ public static final int BLOCK = 0x02; /** Empty element. */ public static final int EMPTY = 0x04; /** Container element. */ public static final int CONTAINER = 0x08; /** Special element. */ public static final int SPECIAL = 0x10; // // Data // /** The element code. */ public short code; /** The element name. */ public String name; /** Informational flags. */ public int flags; /** Parent elements. */ public short[] parentCodes; /** Parent elements. */ public Element[] parent; /** The bounding element code. */ public short bounds; /** List of elements this element can close. */ public short[] closes; /** If set to true, then this element may not be nested, example: "A" **/ boolean nestable = true; // // Constructors // /** * Constructs an element object. * * @param code The element code. * @param name The element name. * @param flags Informational flags * @param parent Natural closing parent name. * @param closes List of elements this element can close. 
*/ public Element(short code, String name, int flags, short parent, short[] closes) { this(code, name, flags, new short[]{parent}, (short)-1, closes); } // (short,String,int,short,short[]); /** * Constructs an element object. * * @param code The element code. * @param name The element name. * @param flags Informational flags * @param parent Natural closing parent name. * @param closes List of elements this element can close. */ public Element(short code, String name, int flags, short parent, short bounds, short[] closes) { this(code, name, flags, new short[]{parent}, bounds, closes); } // (short,String,int,short,short,short[]) /** * Constructs an element object. * * @param code The element code. * @param name The element name. * @param flags Informational flags * @param parents Natural closing parent names. * @param closes List of elements this element can close. */ public Element(short code, String name, int flags, short[] parents, short[] closes) { this(code, name, flags, parents, (short)-1, closes); } // (short,String,int,short[],short[]) /** * Constructs an element object. * * @param code The element code. * @param name The element name. * @param flags Informational flags * @param parents Natural closing parent names. * @param closes List of elements this element can close. */ public Element(short code, String name, int flags, short[] parents, short bounds, short[] closes) { this.code = code; this.name = name; this.flags = flags; this.parentCodes = parents; this.parent = null; this.bounds = bounds; this.closes = closes; if(closes != null) { for(int i=0;i(short,String,int,short[],short,short[]) // // Public methods // /** Returns true if this element is an inline element. */ public final boolean isInline() { return (flags & INLINE) != 0; } // isInline():boolean /** Returns true if this element is a block element. */ public final boolean isBlock() { return (flags & BLOCK) != 0; } // isBlock():boolean /** Returns true if this element is an empty element. 
*/ public final boolean isEmpty() { return (flags & EMPTY) != 0; } // isEmpty():boolean /** Returns true if this element is a container element. */ public final boolean isContainer() { return (flags & CONTAINER) != 0; } // isContainer():boolean /** * Returns true if this element is special -- if its content * should be parsed ignoring markup. */ public final boolean isSpecial() { return (flags & SPECIAL) != 0; } // isSpecial():boolean /** * Returns true if this element can close the specified Element. * * @param tag The element. */ public boolean closes(short tag) { if (closes != null) { for (int i = 0; i < closes.length; i++) { if (closes[i] == tag) { return true; } } } return false; } // closes(short):boolean // // Object methods // /** Returns a hash code for this object. */ public int hashCode() { return name.hashCode(); } // hashCode():int /** Returns true if the objects are equal. */ public boolean equals(Object o) { return name.equals(o); } // equals(Object):boolean /** * Provides a simple representation to make debugging easier */ public String toString() { return super.toString() + "(name=" + name + ")"; } /** * Indicates if the provided element is an accepted parent of current element * @param element the element to test for "paternity" * @return true if element belongs to the {@link #parent} */ public boolean isParent(final Element element) { if (parent == null) return false; else { for (int i=0; i *

  • add missing parent elements; *
  • automatically close elements with optional end tags; and *
  • handle mis-matched inline element tags. * *

    * This component recognizes the following features: *

      *
    • http://cyberneko.org/html/features/augmentations *
    • http://cyberneko.org/html/features/report-errors *
    • http://cyberneko.org/html/features/balance-tags/document-fragment *
    • http://cyberneko.org/html/features/balance-tags/ignore-outside-content *
    *

    * This component recognizes the following properties: *

      *
    • http://cyberneko.org/html/properties/names/elems *
    • http://cyberneko.org/html/properties/names/attrs *
    • http://cyberneko.org/html/properties/error-reporter *
    • http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
    * * @see HTMLElements * * @author Andy Clark * @author Marc Guillemot * * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $ */ public class HTMLTagBalancer implements XMLDocumentFilter, HTMLComponent { // // Constants // // features /** Namespaces. */ protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces"; /** Include infoset augmentations. */ protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; /** Report errors. */ protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; /** Document fragment balancing only (deprecated). */ protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment"; /** Document fragment balancing only. */ protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment"; /** Ignore outside content. */ protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content"; /** Recognized features. */ private static final String[] RECOGNIZED_FEATURES = { NAMESPACES, AUGMENTATIONS, REPORT_ERRORS, DOCUMENT_FRAGMENT_DEPRECATED, DOCUMENT_FRAGMENT, IGNORE_OUTSIDE_CONTENT, }; /** Recognized features defaults. */ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = { null, null, null, null, Boolean.FALSE, Boolean.FALSE, }; // properties /** Modify HTML element names: { "upper", "lower", "default" }. */ protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; /** Modify HTML attribute names: { "upper", "lower", "default" }. */ protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; /** Error reporter. */ protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; /** * EXPERIMENTAL: may change in next release
    * Name of the property holding the stack of elements in which context a document fragment should be parsed. **/ public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack"; /** Recognized properties. */ private static final String[] RECOGNIZED_PROPERTIES = { NAMES_ELEMS, NAMES_ATTRS, ERROR_REPORTER, FRAGMENT_CONTEXT_STACK, }; /** Recognized properties defaults. */ private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = { null, null, null, null, }; // modify HTML names /** Don't modify HTML names. */ protected static final short NAMES_NO_CHANGE = 0; /** Match HTML element names. */ protected static final short NAMES_MATCH = 0; /** Uppercase HTML names. */ protected static final short NAMES_UPPERCASE = 1; /** Lowercase HTML names. */ protected static final short NAMES_LOWERCASE = 2; // static vars /** Synthesized event info item. */ protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem(); // // Data // // features /** Namespaces. */ protected boolean fNamespaces; /** Include infoset augmentations. */ protected boolean fAugmentations; /** Report errors. */ protected boolean fReportErrors; /** Document fragment balancing only. */ protected boolean fDocumentFragment; /** Ignore outside content. */ protected boolean fIgnoreOutsideContent; // properties /** Modify HTML element names. */ protected short fNamesElems; /** Modify HTML attribute names. */ protected short fNamesAttrs; /** Error reporter. */ protected HTMLErrorReporter fErrorReporter; // connections /** The document source. */ protected XMLDocumentSource fDocumentSource; /** The document handler. */ protected XMLDocumentHandler fDocumentHandler; // state /** The element stack. */ protected final InfoStack fElementStack = new InfoStack(); /** The inline stack. */ protected final InfoStack fInlineStack = new InfoStack(); /** True if seen anything. Important for xml declaration. 
*/ protected boolean fSeenAnything; /** True if root element has been seen. */ protected boolean fSeenDoctype; /** True if root element has been seen. */ protected boolean fSeenRootElement; /** * True if seen the end of the document element. In other words, * this variable is set to false until the end </HTML> * tag is seen (or synthesized). This is used to ensure that * extraneous events after the end of the document element do not * make the document stream ill-formed. */ protected boolean fSeenRootElementEnd; /** True if seen <head< element. */ protected boolean fSeenHeadElement; /** True if seen <body< element. */ protected boolean fSeenBodyElement; /** True if a form is in the stack (allow to discard opening of nested forms) */ protected boolean fOpenedForm; // temp vars /** A qualified name. */ private final QName fQName = new QName(); /** Empty attributes. */ private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl(); /** Augmentations. */ private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations(); protected HTMLTagBalancingListener tagBalancingListener; private LostText lostText_ = new LostText(); private boolean forcedStartElement_ = false; private boolean forcedEndElement_ = false; /** * Stack of elements determining the context in which a document fragment should be parsed */ private QName[] fragmentContextStack_ = null; private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList(); // // HTMLComponent methods // /** Returns the default state for a feature. */ public Boolean getFeatureDefault(String featureId) { int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; for (int i = 0; i < length; i++) { if (RECOGNIZED_FEATURES[i].equals(featureId)) { return RECOGNIZED_FEATURES_DEFAULTS[i]; } } return null; } // getFeatureDefault(String):Boolean /** Returns the default state for a property. 
*/ public Object getPropertyDefault(String propertyId) { int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; for (int i = 0; i < length; i++) { if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { return RECOGNIZED_PROPERTIES_DEFAULTS[i]; } } return null; } // getPropertyDefault(String):Object // // XMLComponent methods // /** Returns recognized features. */ public String[] getRecognizedFeatures() { return RECOGNIZED_FEATURES; } // getRecognizedFeatures():String[] /** Returns recognized properties. */ public String[] getRecognizedProperties() { return RECOGNIZED_PROPERTIES; } // getRecognizedProperties():String[] /** Resets the component. */ public void reset(XMLComponentManager manager) throws XMLConfigurationException { // get features fNamespaces = manager.getFeature(NAMESPACES); fAugmentations = manager.getFeature(AUGMENTATIONS); fReportErrors = manager.getFeature(REPORT_ERRORS); fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) || manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED); fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT); // get properties fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER); fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK); } // reset(XMLComponentManager) /** Sets a feature. */ public void setFeature(String featureId, boolean state) throws XMLConfigurationException { if (featureId.equals(AUGMENTATIONS)) { fAugmentations = state; return; } if (featureId.equals(REPORT_ERRORS)) { fReportErrors = state; return; } if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) { fIgnoreOutsideContent = state; return; } } // setFeature(String,boolean) /** Sets a property. 
*/ public void setProperty(String propertyId, Object value) throws XMLConfigurationException { if (propertyId.equals(NAMES_ELEMS)) { fNamesElems = getNamesValue(String.valueOf(value)); return; } if (propertyId.equals(NAMES_ATTRS)) { fNamesAttrs = getNamesValue(String.valueOf(value)); return; } } // setProperty(String,Object) // // XMLDocumentSource methods // /** Sets the document handler. */ public void setDocumentHandler(XMLDocumentHandler handler) { fDocumentHandler = handler; } // setDocumentHandler(XMLDocumentHandler) // @since Xerces 2.1.0 /** Returns the document handler. */ public XMLDocumentHandler getDocumentHandler() { return fDocumentHandler; } // getDocumentHandler():XMLDocumentHandler // // XMLDocumentHandler methods // // since Xerces-J 2.2.0 /** Start document. */ public void startDocument(XMLLocator locator, String encoding, NamespaceContext nscontext, Augmentations augs) throws XNIException { // reset state fElementStack.top = 0; if (fragmentContextStack_ != null) { fragmentContextStackSize_ = fragmentContextStack_.length; for (int i=0; i and have been buffered to consider outside content fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer consumeBufferedEndElements(); // handle empty document if (!fSeenRootElement && !fDocumentFragment) { if (fReportErrors) { fErrorReporter.reportError("HTML2000", null); } if (fDocumentHandler != null) { fSeenRootElementEnd = false; forceStartBody(); // will force and final String body = modifyName("body", fNamesElems); fQName.setValues(null, body, body, null); callEndElement(fQName, synthesizedAugs()); final String ename = modifyName("html", fNamesElems); fQName.setValues(null, ename, ename, null); callEndElement(fQName, synthesizedAugs()); } } // pop all remaining elements else { int length = fElementStack.top - fragmentContextStackSize_; for (int i = 0; i < length; i++) { Info info = fElementStack.pop(); if (fReportErrors) { String ename = info.qname.rawname; 
fErrorReporter.reportWarning("HTML2001", new Object[]{ename}); } if (fDocumentHandler != null) { callEndElement(info.qname, synthesizedAugs()); } } } // call handler if (fDocumentHandler != null) { fDocumentHandler.endDocument(augs); } } // endDocument(Augmentations) /** * Consume elements that have been buffered, like that are first consumed * at the end of document */ private void consumeBufferedEndElements() { final List toConsume = new ArrayList(endElementsBuffer_); endElementsBuffer_.clear(); for (int i=0; i (if any) has been buffered } else if (elementCode == HTMLElements.BODY) { // create if none was present if (!fSeenHeadElement) { final QName head = createQName("head"); forceStartElement(head, null, synthesizedAugs()); endElement(head, synthesizedAugs()); } consumeBufferedEndElements(); // (if any) has been buffered if (fSeenBodyElement) { notifyDiscardedStartElement(elem, attrs, augs); return; } fSeenBodyElement = true; } else if (elementCode == HTMLElements.FORM) { if (fOpenedForm) { notifyDiscardedStartElement(elem, attrs, augs); return; } fOpenedForm = true; } else if (elementCode == HTMLElements.UNKNOWN) { consumeBufferedEndElements(); } // check proper parent if (element.parent != null) { if (!fSeenRootElement && !fDocumentFragment) { String pname = element.parent[0].name; pname = modifyName(pname, fNamesElems); if (fReportErrors) { String ename = elem.rawname; fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname}); } final QName qname = new QName(null, pname, pname, null); final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); if (!parentCreated) { if (!isForcedCreation) { notifyDiscardedStartElement(elem, attrs, augs); } return; } } else { HTMLElements.Element preferedParent = element.parent[0]; if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) { int depth = getParentDepth(element.parent, element.bounds); if (depth == -1) { // no parent found final String pname = 
modifyName(preferedParent.name, fNamesElems); final QName qname = new QName(null, pname, pname, null); if (fReportErrors) { String ename = elem.rawname; fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname}); } final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); if (!parentCreated) { if (!isForcedCreation) { notifyDiscardedStartElement(elem, attrs, augs); } return; } } } } } // if block element, save immediate parent inline elements int depth = 0; if (element.flags == 0) { int length = fElementStack.top; fInlineStack.top = 0; for (int i = length - 1; i >= 0; i--) { Info info = fElementStack.data[i]; if (!info.element.isInline()) { break; } fInlineStack.push(info); endElement(info.qname, synthesizedAugs()); } depth = fInlineStack.top; } // close previous elements // all elements close a