jsilver-1.0.0.dfsg.orig/ 0000755 0001750 0001750 00000000000 11767454573 014074 5 ustar chris chris jsilver-1.0.0.dfsg.orig/build.xml 0000644 0001750 0001750 00000003161 11427052602 015672 0 ustar chris chris
Note: These are the exact methods exposed in the original C++ Parser. The * names are simply modified to conform to Java. */ public interface HtmlParser extends Parser { /** * The Parser Mode requested for parsing a given template. * Currently we support: *
class="someClass" target="_blank"
Which could be included from a parent template that contains * an anchor tag, say:
*<a href="/bla" ["INCLUDED_TEMPLATE"]>
All these states map exactly to those exposed in the C++ (original) * version of the HtmlParser. */ public final static ExternalState STATE_TEXT = new ExternalState("STATE_TEXT"); public final static ExternalState STATE_TAG = new ExternalState("STATE_TAG"); public final static ExternalState STATE_COMMENT = new ExternalState("STATE_COMMENT"); public final static ExternalState STATE_ATTR = new ExternalState("STATE_ATTR"); public final static ExternalState STATE_VALUE = new ExternalState("STATE_VALUE"); public final static ExternalState STATE_JS_FILE = new ExternalState("STATE_JS_FILE"); public final static ExternalState STATE_CSS_FILE = new ExternalState("STATE_CSS_FILE"); /** * Returns {@code true} if the parser is currently processing Javascript. * Such is the case if and only if, the parser is processing an attribute * that takes Javascript, a Javascript script block or the parser * is (re)set with {@link Mode#JS}. * * @return {@code true} if the parser is processing Javascript, * {@code false} otherwise */ public boolean inJavascript(); /** * Returns {@code true} if the parser is currently processing * a Javascript litteral that is quoted. The caller will typically * invoke this method after determining that the parser is processing * Javascript. Knowing whether the element is quoted or not helps * determine which escaping to apply to it when needed. * * @return {@code true} if and only if the parser is inside a quoted * Javascript literal */ public boolean isJavascriptQuoted(); /** * Returns {@code true} if and only if the parser is currently within * an attribute, be it within the attribute name or the attribute value. * * @return {@code true} if and only if inside an attribute */ public boolean inAttribute(); /** * Returns {@code true} if and only if the parser is currently within * a CSS context. A CSS context is one of the below: *
This is useful to determine which escaping to apply based * on the type of value this attribute expects. * * @return type of the attribute * @see HtmlParser.ATTR_TYPE */ public ATTR_TYPE getAttributeType(); /** * Returns {@code true} if and only if the parser is currently within * an attribute value and that attribute value is quoted. * * @return {@code true} if and only if the attribute value is quoted */ public boolean isAttributeQuoted(); /** * Returns the name of the HTML tag if the parser is currently within one. * Note that the name may be incomplete if the parser is currently still * parsing the name. Returns an empty {@code String} if the parser is not * in a tag as determined by {@code getCurrentExternalState}. * * @return the name of the HTML tag or an empty {@code String} if we are * not within an HTML tag */ public String getTag(); /** * Returns the name of the HTML attribute the parser is currently processing. * If the parser is still parsing the name, then the returned name * may be incomplete. Returns an empty {@code String} if the parser is not * in an attribute as determined by {@code getCurrentExternalState}. * * @return the name of the HTML attribute or an empty {@code String} * if we are not within an HTML attribute */ public String getAttribute(); /** * Returns the value of an HTML attribute if the parser is currently * within one. If the parser is currently parsing the value, the returned * value may be incomplete. The caller will typically first determine * that the parser is processing a value by calling * {@code getCurrentExternalState}. * * @return the value, could be an empty {@code String} if the parser is not * in an HTML attribute value */ public String getValue(); /** * Returns the current position of the parser within the HTML attribute * value, zero being the position of the first character in the value. * The caller will typically first determine that the parser is * processing a value by calling {@link #getState()}. 
* * @return the index or zero if the parser is not processing a value */ public int getValueIndex(); /** * Returns {@code true} if and only if the current position of the parser is * at the start of a URL HTML attribute value. This is the case when the * following three conditions are all met: *
*
This method may be used by an Html Sanitizer or an Auto-Escape system * to determine whether to validate the URL for well-formedness and validate * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe. * In particular, it is recommended to use this method instead of * checking that {@link #getValueIndex()} is {@code 0} to support attribute * types where the URL does not start at index zero, such as the * {@code content} attribute of the {@code meta} HTML tag. * * @return {@code true} if and only if the parser is at the start of the URL */ public boolean isUrlStart(); /** * Resets the state of the parser, allowing for reuse of the * {@code HtmlParser} object. * *
See the {@link HtmlParser.Mode} enum for information on all * the valid modes. * * @param mode is an enum representing the high-level state of the parser */ public void resetMode(HtmlParser.Mode mode); /** * A specialized directive to tell the parser there is some content * that will be inserted here but that it will not get to parse. Used * by the template system that may not be able to give some content * to the parser but wants it to know there typically will be content * inserted at that point. This is a hint used in corner cases within * parsing of HTML attribute names and values where content we do not * get to see could affect our parsing and alter our current state. * *
Returns {@code false} if and only if the parser encountered * a fatal error which prevents it from continuing further parsing. * *
Note: The return value is different from the C++ Parser which * always returns {@code true} but in my opinion makes more sense. * * @throws ParseException if an unrecoverable error occurred during parsing */ public void insertText() throws ParseException; /** * Returns the state the Javascript parser is in. * *
See {@link JavascriptParser} for more information on the valid
* external states. The caller will typically first determine that the
* parser is processing Javascript and then invoke this method to
* obtain more fine-grained state information.
*
* @return external state of the javascript parser
*/
public ExternalState getJavascriptState();
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/ExternalState.java 0000644 0001750 0001750 00000005021 11427052602 025712 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser;
import com.google.common.base.Preconditions;
/**
* A representation of the parser state suitable for use by the caller
* of the Parser. The meaning of each state and therefore which action
* the caller should perform on that state is not self-evident. In particular,
* it depends on which parser is used (currently {@link HtmlParser} and
* {@link JavascriptParser}). For examples, you will have to look
* at the Google Template System
and ClearSilver
* both of which support Auto-Escaping by interfacing with our parser
* (using the parser written in C++).
*
*
The caller of the Parser will query for the current parser state at * points of interest during parsing of templates. Based on the parser's * current state as represented by this class, the caller can determine * the appropriate escaping to apply. * *
Note: Given this class is external-facing, I considered creating * an interface but it is not likely we'll ever need to add more flexibility * and the class is so simple, I figured it was not warranted. * * * @see HtmlParser * @see JavascriptParser */ public class ExternalState { private final String name; /** * Creates an {@code ExternalState} object. * * @param name the name to assign to that state * @see HtmlParser * @see JavascriptParser */ public ExternalState(String name) { Preconditions.checkNotNull(name); // Developer error if it happens. this.name = name; } /** * Returns the name of the object. The name is only needed * to provide human-readable information when debugging. * * @return the name of that object */ public String getName() { return name; } /** * Returns the string representation of this external state. * The details of this representation are subject to change. */ @Override public String toString() { return String.format("ExternalState: %s", name); } } jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/JavascriptParser.java 0000644 0001750 0001750 00000002730 11427052602 026416 0 ustar chris chris /* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser; /** * Methods exposed for Javascript parsing of text to facilitate implementation * of Automatic context-aware escaping. 
This interface does not add * additional methods on top of {@code Parser} for the time being, * it simply exposes the states in which the Javascript parser may be in. * *
Note: These are the exact states exposed in the original C++ Parser. */ public interface JavascriptParser extends Parser { public static final ExternalState STATE_TEXT = new ExternalState("STATE_TEXT"); public static final ExternalState STATE_Q = new ExternalState("STATE_Q"); public static final ExternalState STATE_DQ = new ExternalState("STATE_DQ"); public static final ExternalState STATE_REGEXP = new ExternalState("STATE_REGEXP"); public static ExternalState STATE_COMMENT = new ExternalState("STATE_COMMENT"); } jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/ 0000755 0001750 0001750 00000000000 11767454572 023252 5 ustar chris chris jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/GenericParser.java 0000644 0001750 0001750 00000022232 11427052602 026624 0 ustar chris chris /* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.impl; import com.google.common.base.Preconditions; import com.google.streamhtmlparser.ExternalState; import com.google.streamhtmlparser.Parser; import com.google.streamhtmlparser.ParseException; import com.google.streamhtmlparser.util.HtmlUtils; import java.util.Map; /** * An implementation of the {@code Parser} interface that is common to both * {@code HtmlParser} and {@code JavascriptParser}. * *
Provides methods for parsing input and ensuring that all in-state, * entering-a-state and exiting-a-state callbacks are invoked as appropriate. * *
This class started as abstract but it was found better for testing to
* make it instantiatable so that the parsing logic can be tested with dummy
* state transitions.
*/
public class GenericParser implements Parser {
protected final ParserStateTable parserStateTable;
protected final Map Absent any callbacks defined, this function simply determines the
* next state to switch to based on the However some states have specific callbacks defined which when
* receiving specific characters may decide to overwrite the next state to
* go to. Hence the next state is a function both of the main state table
* in {@code ParserStateTable} as well as specific run-time information
* from the callback functions.
*
* Also note that the callbacks are called in a proper sequence,
* first the exit-state one then the enter-state one and finally the
* in-state one. Changing the order may result in a functional change.
*
* @param input the input character to parse (process)
* @throws ParseException if an unrecoverable error occurred during parsing
*/
@Override
public void parse(char input) throws ParseException {
// Ask the state table which state this character leads to from the
// current one.
InternalState nextState =
parserStateTable.getNextState(currentState, input);
if (nextState == InternalState.INTERNAL_ERROR_STATE) {
// The message is built before currentState is overwritten so that it
// reports the state the parser was in when the character arrived.
String errorMsg =
String.format("Unexpected character '%s' in int_state '%s' " +
"(ext_state '%s')",
HtmlUtils.encodeCharForAscii(input),
currentState.getName(), getState().getName());
currentState = InternalState.INTERNAL_ERROR_STATE;
throw new ParseException(this, errorMsg);
}
// Exit callback: its return value replaces the state chosen by the table.
if (currentState != nextState) {
nextState = handleExitState(currentState, nextState, input);
}
// The comparison is repeated because the exit callback may have changed
// nextState. The state being entered is passed as both arguments.
if (currentState != nextState) {
nextState = handleEnterState(nextState, nextState, input);
}
// The in-state callback runs for every character, on the state selected
// above, and may itself change the resulting state.
nextState = handleInState(nextState, input);
currentState = nextState;
record(input);
// Position tracking: a newline advances the line counter and restarts
// the column counter at 1.
columnNumber++;
if (input == '\n') {
lineNumber++;
columnNumber = 1;
}
}
/**
 * Returns the external state that the current internal state maps to.
 *
 * @return the {@code ExternalState} registered for the current internal state
 * @throws NullPointerException if the internal-to-external state table has
 *         no entry for the current internal state
 */
@Override
public ExternalState getState() {
  if (intToExtStateTable.containsKey(currentState)) {
    return intToExtStateTable.get(currentState);
  }
  throw new NullPointerException("Did not find external state mapping " +
      "For internal state: " + currentState);
}
/**
 * Puts the parser back into its initial state and rewinds the reported
 * position to line 1, column 1.
 */
@Override
public void reset() {
  columnNumber = 1;
  lineNumber = 1;
  currentState = initialState;
}
/**
 * Overrides the line number tracked by the parser.
 *
 * @param lineNumber the line number to continue counting from
 */
@Override
public void setLineNumber(int lineNumber) {
  this.lineNumber = lineNumber;
}
/**
 * Returns the line number the parser is currently at.
 *
 * @return the tracked line number
 */
@Override
public int getLineNumber() {
  return this.lineNumber;
}
/**
 * Overrides the column number tracked by the parser.
 *
 * @param columnNumber the column number to continue counting from
 */
@Override
public void setColumnNumber(int columnNumber) {
  this.columnNumber = columnNumber;
}
/**
 * Returns the column number the parser is currently at.
 *
 * @return the tracked column number
 */
@Override
public int getColumnNumber() {
  return this.columnNumber;
}
/**
 * Exposes the internal (fine-grained) state the parser is currently in.
 * Package-private.
 *
 * @return the current {@code InternalState}
 */
InternalState getCurrentInternalState() {
  return this.currentState;
}
/**
 * Moves the parser to {@code nextState} without consuming an input
 * character. The exit-state and enter-state callbacks are run with a NUL
 * character; the in-state callback and {@code record} are not invoked.
 *
 * @param nextState the state to move to; must not be {@code null}
 * @throws ParseException if a callback reports an unrecoverable error
 */
protected void setNextState(InternalState nextState) throws ParseException {
Preconditions.checkNotNull(nextState); // Developer error if it triggers.
/* We are not actually parsing hence providing
* a null char to the event handlers.
*/
// TODO: Complicated logic to follow in C++ but clean it up.
final char nullChar = '\0';
// The exit callback's return value replaces the requested state.
if (currentState != nextState) {
nextState = handleExitState(currentState, nextState, nullChar);
}
// NOTE(review): unlike parse(), the value returned by handleEnterState is
// discarded here, so an enter callback cannot redirect this transition.
// Confirm that this difference is intentional.
if (currentState != nextState) {
handleEnterState(nextState, nextState, nullChar);
}
currentState = nextState;
}
/**
 * Hook invoked when the parser enters a new state. This base implementation
 * performs no work and accepts the proposed state.
 *
 * <p>Both call sites in this class ({@code parse} and {@code setNextState})
 * pass the state being entered as <em>both</em> {@code currentState} and
 * {@code expectedNextState}.
 *
 * @param currentState as invoked from this class, the state being entered
 * @param expectedNextState the state being entered
 * @param input the character that caused the transition, or {@code '\0'}
 *        when invoked from {@code setNextState}
 * @return the state to change to; here always {@code expectedNextState}
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
protected InternalState handleEnterState(
    InternalState currentState,
    InternalState expectedNextState,
    char input) throws ParseException {
  return expectedNextState;
}
/**
 * Hook invoked when the parser is about to leave a state. This base
 * implementation performs no work and accepts the proposed state.
 *
 * @param currentState the state being left
 * @param expectedNextState the state the parser is about to move to
 * @param input the character that caused the transition, or {@code '\0'}
 *        when invoked from {@code setNextState}
 * @return the state to change to; here always {@code expectedNextState}
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
protected InternalState handleExitState(
    InternalState currentState,
    InternalState expectedNextState,
    char input) throws ParseException {
  return expectedNextState;
}
/**
 * Hook invoked by {@code parse} for every character read, after any
 * exit-state and enter-state callbacks have run. This base implementation
 * performs no work and keeps the state it was given.
 *
 * @param currentState the state selected for this character
 * @param input the character being parsed
 * @return the state the parser should be in after this character; here
 *         always {@code currentState}
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
protected InternalState handleInState(
    InternalState currentState,
    char input) throws ParseException {
  return currentState;
}
/**
 * Per-character hook called by {@code parse} once the state for the
 * character has been settled. Subclasses may override it to add processing
 * beyond what the state transitions define; this base implementation is
 * intentionally empty.
 *
 * @param input the character that was just parsed
 */
protected void record(char input) {
  // Nothing to record by default.
}
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/InternalState.java 0000644 0001750 0001750 00000007303 11427052602 026652 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
import java.util.concurrent.atomic.AtomicInteger;
/**
* A very simple representation of the parser internal state. The state
* contains a small integer identifier (from 1 to 255) to allow for
* the implementation of a simple finite state machine. The name is
* purely informational.
*
* In order to eliminate the possibility that different states have
* the same identifier, this class manages the identifiers themselves.
* The HTML and Javascript parser states are managed elsewhere in different
* "namespaces" hence will not clash and there is no current need for this
* class to disambiguate them further.
*
* The methods to create new This is the main class in the package. It implements the
* {@code HtmlParser} interface.
*
* This class is not thread-safe, in particular you cannot invoke any
* state changing operations (such as {@code parse}) from multiple threads
* on the same object.
*
* If you are looking at this class, chances are very high you are
* implementing Auto-Escaping for a new template system. Please see the
* landing page including a design document at
* Auto-Escape Landing Page.
*/
public class HtmlParserImpl extends GenericParser implements HtmlParser {
/*
* Internal representation of the parser state, which is at a
* finer-granularity than the external state as given to callers.
* The relationship between Both for performance reasons and to leverage code a state-flow machine
* that is automatically generated from Python for multiple target
* languages, this object uses a static {@code ParserStateTable} that
* is read-only and obtained from the generated code in {@code HtmlParserFsm}.
* That code also maintains the mapping from internal states
* ({@code InternalState}) to external states ({@code ExternalState}).
*/
public HtmlParserImpl() {
  // Share the static state table and state mapping; start in TEXT.
  super(STATE_TABLE, STATE_MAPPING, TEXT);

  // Recorders for the pieces of markup callers may ask about.
  this.tag = new CharacterRecorder();
  this.attr = new CharacterRecorder();
  this.value = new CharacterRecorder();
  this.cdataCloseTag = new CharacterRecorder();

  // Helpers used while inside Javascript.
  this.entityResolver = new EntityResolver();
  this.jsParser = new JavascriptParserImpl();

  // Plain per-parse flags and counters.
  this.insideJavascript = false;
  this.valueIndex = 0;
  this.textInsideUrlValue = false;
}
/**
 * Copy constructor. The recorders, the entity resolver and the Javascript
 * parser are duplicated through their own copy constructors; the remaining
 * flags and counters are copied by value.
 *
 * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
 */
public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
  super(aHtmlParserImpl);

  this.tag = new CharacterRecorder(aHtmlParserImpl.tag);
  this.attr = new CharacterRecorder(aHtmlParserImpl.attr);
  this.value = new CharacterRecorder(aHtmlParserImpl.value);
  this.cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);

  this.entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
  this.jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);

  this.insideJavascript = aHtmlParserImpl.insideJavascript;
  this.valueIndex = aHtmlParserImpl.valueIndex;
  this.textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
}
@Override
public boolean inJavascript() {
  // The flag is necessary but not sufficient: the parser must also be in
  // one of the states that can actually carry Javascript.
  if (!insideJavascript) {
    return false;
  }
  if (getState() == STATE_VALUE) {
    return true;
  }
  return currentState == CDATA_TEXT
      || currentState == CDATA_COM_START
      || currentState == CDATA_COM_START_DASH
      || currentState == CDATA_COM_BODY
      || currentState == CDATA_COM_DASH
      || currentState == CDATA_COM_DASH_DASH
      || currentState == CDATA_LT
      || currentState == CDATA_MAY_CLOSE
      || currentState == JS_FILE;
}
@Override
public boolean isJavascriptQuoted() {
  if (!inJavascript()) {
    return false;
  }
  // Quoted means the Javascript parser reports one of its two quote states
  // (STATE_Q or STATE_DQ).
  final ExternalState jsState = jsParser.getState();
  if (jsState == JavascriptParserImpl.STATE_Q) {
    return true;
  }
  return jsState == JavascriptParserImpl.STATE_DQ;
}
@Override
public boolean inAttribute() {
  final ExternalState state = getState();
  if (state == null) {
    return false;
  }
  // Both the attribute name and the attribute value count as "in attribute".
  return state == STATE_ATTR || state == STATE_VALUE;
}
/**
* Returns {@code true} if and only if the parser is currently within
* a CSS context. A CSS context is one of the below:
* See the {@link HtmlParser.Mode} enum for information on all
* the valid modes.
*
* @param mode is an enum representing the high-level state of the parser
*/
@Override
public void resetMode(Mode mode) {
  // Discard everything accumulated from the previous input so the object
  // can be reused.
  insideJavascript = false;
  tag.reset();
  attr.reset();
  value.reset();
  cdataCloseTag.reset();
  valueIndex = 0;
  textInsideUrlValue = false;
  jsParser.reset();
  // Also drop any partially-read HTML entity; otherwise a reset issued in
  // the middle of an entity inside a Javascript attribute value would leak
  // resolver state into the next parse.
  entityResolver.reset();

  // Select the starting internal state for the requested mode. Only JS
  // mode starts out inside Javascript.
  switch (mode) {
    case HTML:
      currentState = TEXT;
      break;
    case JS:
      currentState = JS_FILE;
      insideJavascript = true;
      break;
    case CSS:
      currentState = CSS_FILE;
      break;
    case HTML_IN_TAG:
      currentState = TAG_SPACE;
      break;
    default:
      throw new IllegalArgumentException("Did not recognize Mode: " +
          mode.toString());
  }
}
/**
 * Resets the state of the parser to the initial state of parsing HTML.
 *
 * <p>The superclass reset runs first, then {@link #resetMode(Mode)} is
 * called with {@code Mode.HTML} to clear the HTML-specific state.
 */
@Override
public void reset() {
  super.reset();
  resetMode(Mode.HTML);
}
/**
* A specialized directive to tell the parser there is some content
* that will be inserted here but that it will not get to parse. Used
* by the template system that may not be able to give some content
* to the parser but wants it to know there typically will be content
* inserted at that point. This is a hint used in corner cases within
* parsing of HTML attribute names and values where content we do not
* get to see could affect our parsing and alter our current state.
*
* The two cases where {@code #insertText()} affects our parsing are:
* Called for every character inside an attribute value.
*
* @param input character read
* @throws ParseException if an unrecoverable error occurred during parsing
*/
private void inStateValue(char input) throws ParseException {
  valueIndex++;
  if (!insideJavascript) {
    return;
  }

  // Run the character through the entity resolver first. The Javascript
  // parser receives either the raw character (no entity in progress) or the
  // resolved entity once it is complete; any other status feeds nothing.
  final EntityResolver.Status resolverStatus = entityResolver.processChar(input);
  if (resolverStatus == EntityResolver.Status.NOT_STARTED) {
    jsParser.parse(input);
  } else if (resolverStatus == EntityResolver.Status.COMPLETED) {
    jsParser.parse(entityResolver.getEntity());
    entityResolver.reset();
  }
}
/**
 * Decides which state follows the tag that was just read.
 *
 * <p>A {@code script} tag resets the Javascript parser, marks the parser as
 * being inside Javascript and switches to {@code CDATA_TEXT}. The
 * {@code style}, {@code title} and {@code textarea} tags also switch to
 * {@code CDATA_TEXT} but clear the Javascript flag. Any other tag leaves the
 * given state untouched.
 *
 * @param state the state the parser would otherwise move to
 * @return {@code CDATA_TEXT} for the tags listed above, otherwise
 *         {@code state}
 */
private InternalState tagClose(InternalState state) {
  final String tagName = getTag();

  if ("script".equals(tagName)) {
    jsParser.reset();
    insideJavascript = true;
    return CDATA_TEXT;
  }

  final boolean otherCdataTag = "style".equals(tagName)
      || "title".equals(tagName)
      || "textarea".equals(tagName);
  if (otherCdataTag) {
    insideJavascript = false;
    return CDATA_TEXT;
  }

  return state;
}
/**
 * Forwards a character read inside a CDATA block to the Javascript parser,
 * but only while the parser is flagged as being inside Javascript.
 *
 * @param input character read
 * @throws ParseException if an unrecoverable error occurred during parsing
 */
private void inStateCdata(char input) throws ParseException {
  if (!insideJavascript) {
    return;
  }
  jsParser.parse(input);
}
/**
 * Begins recording into {@code cdataCloseTag}. The recorded text is read
 * back in {@code exitStateCdataMayClose} to decide whether the CDATA
 * section is really being closed.
 */
private void enterStateCdataMayClose() {
  this.cdataCloseTag.startRecording();
}
/**
 * Decides whether what was just read really closes the current CDATA
 * element. It does when the recorded name (after the leading {@code '/'})
 * equals the current tag name, ignoring case, and the terminating character
 * is {@code '>'} or HTML whitespace. In that case the tag is cleared, the
 * Javascript flag is dropped and the expected state is kept; otherwise the
 * parser returns to {@code CDATA_TEXT}.
 *
 * @param expectedNextState the state to go to next unless overridden here
 * @param input the character read
 * @return the next state to go to
 */
private InternalState exitStateCdataMayClose(
    InternalState expectedNextState,
    char input) {
  cdataCloseTag.stopRecording();
  final String recorded = cdataCloseTag.getContent();

  // Developer error if the recording does not start with '/'.
  Preconditions.checkState(!recorded.isEmpty()
      && recorded.charAt(0) == '/');

  final boolean closesCurrentTag =
      recorded.substring(1).equalsIgnoreCase(getTag())
          && (input == '>' || HtmlUtils.isHtmlSpace(input));
  if (!closesCurrentTag) {
    return CDATA_TEXT;
  }

  tag.clear();
  insideJavascript = false;
  return expectedNextState;
}
// ======================================================= //
// SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. //
// ======================================================= //
/**
 * Records in {@code STATE_MAPPING} which external state an internal state
 * is reported as.
 *
 * @param internalState the internal state used as the key
 * @param externalState the external state it maps to
 */
private static void registerMapping(
    InternalState internalState,
    ExternalState externalState) {
  STATE_MAPPING.put(internalState, externalState);
}
/**
 * Fills {@code STATE_MAPPING} with the external state reported for every
 * internal state of the HTML parser. Entries are grouped by the external
 * state they map to.
 */
private static void initializeStateMapping() {
  // Each parser implementation must map the error state appropriately.
  registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);

  final InternalState[] reportedAsText = {
      TEXT, DECL_START, DECL_BODY, COM_OPEN, PI, PI_MAY_END, TAG_CLOSE,
      CDATA_COM_START, CDATA_COM_START_DASH, CDATA_COM_BODY, CDATA_COM_DASH,
      CDATA_COM_DASH_DASH, CDATA_TEXT, CDATA_LT, CDATA_MAY_CLOSE,
  };
  for (InternalState internal : reportedAsText) {
    registerMapping(internal, HtmlParser.STATE_TEXT);
  }

  final InternalState[] reportedAsTag = {TAG_START, TAG_NAME, TAG_SPACE};
  for (InternalState internal : reportedAsTag) {
    registerMapping(internal, HtmlParser.STATE_TAG);
  }

  final InternalState[] reportedAsComment = {COM_BODY, COM_DASH, COM_DASH_DASH};
  for (InternalState internal : reportedAsComment) {
    registerMapping(internal, HtmlParser.STATE_COMMENT);
  }

  final InternalState[] reportedAsAttr = {ATTR, ATTR_SPACE};
  for (InternalState internal : reportedAsAttr) {
    registerMapping(internal, HtmlParser.STATE_ATTR);
  }

  final InternalState[] reportedAsValue = {
      VALUE, VALUE_TEXT, VALUE_Q_START, VALUE_Q, VALUE_DQ_START, VALUE_DQ,
  };
  for (InternalState internal : reportedAsValue) {
    registerMapping(internal, HtmlParser.STATE_VALUE);
  }

  registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
  registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
}
/**
 * Registers one transition in {@code STATE_TABLE}: characters matching
 * {@code expression} move the parser from {@code source} to {@code to}.
 *
 * @param expression the character expression that triggers the transition
 * @param source the state the transition starts from
 * @param to the state the transition leads to
 */
private static void registerTransition(String expression,
                                       InternalState source,
                                       InternalState to) {
  // Going through a StateTableTransition may seem silly, but it adds extra
  // data checking before the values reach the table.
  final StateTableTransition transition =
      new StateTableTransition(expression, source, to);
  STATE_TABLE.setExpression(
      transition.getExpression(),
      transition.getFrom(),
      transition.getTo());
}
// Populates STATE_TABLE with every transition of the HTML parser, one
// registerTransition call per (character expression, from-state, to-state).
// The calls are grouped by from-state; the order inside a group matters.
// NOTE: The "[:default:]" transition should be registered before any
// other transitions for a given state or it will over-write them.
// NOTE(review): no "[:default:]" transition is registered for ATTR_SPACE,
// ATTR, TAG_SPACE or TAG_NAME; presumably characters not listed for those
// states lead to the error state -- confirm against ParserStateTable.
private static void initializeParserStateTable() {
// Whole-file CSS and Javascript modes: every character stays in the state.
registerTransition("[:default:]", CSS_FILE, CSS_FILE);
registerTransition("[:default:]", JS_FILE, JS_FILE);
// CDATA sections, including a possible closing tag and embedded comments.
registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
registerTransition(">", CDATA_MAY_CLOSE, TEXT);
registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
registerTransition("!", CDATA_LT, CDATA_COM_START);
registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
registerTransition("<", CDATA_TEXT, CDATA_LT);
registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
// Attribute values: double-quoted, single-quoted and unquoted.
registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
registerTransition("\"", VALUE_DQ, TAG_SPACE);
registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
registerTransition("[:default:]", VALUE_Q, VALUE_Q);
registerTransition("\'", VALUE_Q, TAG_SPACE);
registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
registerTransition("\'", VALUE_Q_START, TAG_SPACE);
registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
registerTransition(">", VALUE_TEXT, TAG_CLOSE);
registerTransition("[:default:]", VALUE, VALUE_TEXT);
registerTransition(">", VALUE, TAG_CLOSE);
registerTransition(" \t\n\r", VALUE, VALUE);
registerTransition("\"", VALUE, VALUE_DQ_START);
registerTransition("\'", VALUE, VALUE_Q_START);
// Attribute names and the whitespace that may follow them.
registerTransition("=", ATTR_SPACE, VALUE);
registerTransition("/", ATTR_SPACE, TAG_SPACE);
registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
registerTransition(">", ATTR_SPACE, TAG_CLOSE);
registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
registerTransition("=", ATTR, VALUE);
registerTransition("/", ATTR, TAG_SPACE);
registerTransition(">", ATTR, TAG_CLOSE);
registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
// End of a tag and the space between a tag name and its attributes.
registerTransition("[:default:]", TAG_CLOSE, TEXT);
registerTransition("<", TAG_CLOSE, TAG_START);
registerTransition("/", TAG_SPACE, TAG_SPACE);
registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
registerTransition(">", TAG_SPACE, TAG_CLOSE);
// Processing instructions.
registerTransition("[:default:]", PI_MAY_END, PI);
registerTransition(">", PI_MAY_END, TEXT);
registerTransition("[:default:]", PI, PI);
registerTransition("?", PI, PI_MAY_END);
// HTML comments.
registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
registerTransition(">", COM_DASH_DASH, TEXT);
registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
registerTransition("[:default:]", COM_DASH, COM_BODY);
registerTransition("-", COM_DASH, COM_DASH_DASH);
registerTransition("[:default:]", COM_BODY, COM_BODY);
registerTransition("-", COM_BODY, COM_DASH);
registerTransition("[:default:]", COM_OPEN, TEXT);
registerTransition("-", COM_OPEN, COM_BODY);
// Declarations.
registerTransition("[:default:]", DECL_BODY, DECL_BODY);
registerTransition(">", DECL_BODY, TEXT);
registerTransition("[:default:]", DECL_START, DECL_BODY);
registerTransition(">", DECL_START, TEXT);
registerTransition("-", DECL_START, COM_OPEN);
// Tag names and the start of a tag.
registerTransition(">", TAG_NAME, TAG_CLOSE);
registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
// Manual change to remain in-sync with CL 10597850 in C HtmlParser.
registerTransition("[:default:]", TAG_START, TEXT);
registerTransition("<", TAG_START, TAG_START);
// End of manual change.
registerTransition("!", TAG_START, DECL_START);
registerTransition("?", TAG_START, PI);
registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
// Plain text.
registerTransition("[:default:]", TEXT, TEXT);
registerTransition("<", TEXT, TAG_START);
}
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/ParserStateTable.java 0000644 0001750 0001750 00000015456 11427052602 027312 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
/**
* Holds a state table which is defined as the set of all
* recognized state transitions and the set of characters that
* trigger them.
*
* The logic of what character causes what state transition is derived from
* a base definition written as a Python configuration file in the original
* C++ parser.
*
* This class provides methods to initially build the state table and then
* methods at parsing time to determine the transitions to subsequent states.
*
* Note on characters outside the extended ASCII range: Currently, all state
* transitions in the Python configuration file trigger only on extended
* ASCII characters, that is characters in the Unicode space of [U+0000 to
* U+00FF]. We use that property to design a more efficient state transition
* representation. When receiving characters outside that ASCII range, we
* simply apply the DEFAULT transition for the given state - as we do for any
* character that is not a hot character for that state. If no default
* transition exists, we switch to the Internal Error state.
*
* Technical note: In Java, a {@code char} is a code unit and in some cases
* may not represent a complete Unicode code point. However, when that happens,
* the code units that follow for that code point are all in the surrogate area
* [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
* any incorrect state transitions.
*
* This class is storage-inefficient but it is static at least
* and not generated for each Parser instance.
*/
class ParserStateTable {

  /**
   * A limit on how many different states we can have in one state table.
   * Can be increased should it no longer be sufficient.
   */
  private static final int MAX_STATES = 256;

  /**
   * We only check transitions for (extended) ASCII characters, hence
   * characters in the range 0 to MAX_CHARS - 1.
   */
  private static final int MAX_CHARS = 256;

  /**
   * Records all state transitions except those identified as DEFAULT
   * transitions. Indexed by the numeric ID of the source state and then by
   * the character received; a {@code null} cell means no explicit transition.
   */
  private final InternalState[][] stateTable;

  /**
   * Records all DEFAULT state transitions, i.e. those registered with the
   * {@code "[:default:]"} expression. Indexed by the numeric ID of the source
   * state; there is at most one default per source state.
   */
  private final InternalState[] defaultStateTable;

  /**
   * Creates an empty table: no explicit and no default transitions.
   */
  public ParserStateTable() {
    stateTable = new InternalState[MAX_STATES][MAX_CHARS];
    defaultStateTable = new InternalState[MAX_STATES];
  }

  /**
   * Returns the state to go to when receiving {@code currentChar}
   * in the {@code from} state.
   *
   * <p>For characters below {@code MAX_CHARS}, an explicit transition is
   * looked up first; when none is registered (or the character is outside
   * that range) the default transition of {@code from} is used.
   *
   * @param from the source state
   * @param currentChar the character received
   * @return the state to move to, or
   *         {@code InternalState.INTERNAL_ERROR_STATE} if {@code from} is
   *         {@code null}, {@code currentChar} is negative, the state ID is
   *         outside the table, or no transition (explicit or default) exists
   */
  InternalState getNextState(InternalState from, int currentChar) {
    // TODO: Consider throwing run-time error here.
    if (from == null || currentChar < 0) {
      return InternalState.INTERNAL_ERROR_STATE;
    }

    int id = from.getId();
    if (id < 0 || id >= MAX_STATES) {
      return InternalState.INTERNAL_ERROR_STATE;
    }

    InternalState result = null;
    if (currentChar < MAX_CHARS) {
      result = stateTable[id][currentChar];
    }
    if (result == null) {
      // Reuse the already validated id rather than asking the state again.
      result = defaultStateTable[id];
    }
    return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
  }

  /**
   * Registers the transitions described by {@code expr} from {@code from}
   * to {@code to}. Does nothing if any argument is {@code null}.
   *
   * <p>The expression is either the special string {@code "[:default:]"}
   * (registers the default transition) or a list of characters in which
   * {@code x-y} denotes the inclusive range from {@code x} to {@code y}.
   * A {@code '-'} in the last position is taken literally.
   *
   * @param expr the characters triggering the transition
   * @param from the source state
   * @param to the destination state
   */
  void setExpression(String expr, InternalState from, InternalState to) {
    if ((expr == null) || (from == null) || (to == null)) {
      return;
    }

    // This special string indicates a default state transition.
    if ("[:default:]".equals(expr)) {
      setDefaultDestination(from, to);
      return;
    }

    int i = 0;
    while (i < expr.length()) {
      // If next char is a '-' which is not the last character of the expr
      if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
        setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
        // Note: the index lands on the range's end character, which is
        // therefore examined again on the next iteration.
        i += 2;
      } else {
        setDestination(from, expr.charAt(i), to);
        i++;
      }
    }
  }

  /**
   * Registers an explicit transition to {@code to} for every character
   * below {@code MAX_CHARS}. Not referenced from within this class.
   */
  private void fill(InternalState from, InternalState to) {
    char c;
    for (c = 0; c < MAX_CHARS; c++) {
      setDestination(from, c, to);
    }
  }

  /**
   * Registers {@code to} as the default destination of {@code from}.
   * Silently ignored when the ID of {@code from} does not fit in the table.
   */
  private void setDefaultDestination(InternalState from, InternalState to) {
    Preconditions.checkNotNull(from);   // Developer error if it triggers
    Preconditions.checkNotNull(to);     // Developer error if it triggers
    int id = from.getId();
    if ((id < 0) || (id >= MAX_STATES)) {
      return;
    }
    // TODO: Consider asserting if there was a state transition defined.
    defaultStateTable[id] = to;
  }

  /**
   * Registers the explicit transition {@code from --chr--> to}.
   * Silently ignored when the ID of {@code from} does not fit in the table.
   *
   * @throws IllegalArgumentException if {@code chr} is not below
   *         {@code MAX_CHARS}
   */
  private void setDestination(InternalState from, char chr, InternalState to) {
    Preconditions.checkNotNull(from);   // Developer error if it triggers
    Preconditions.checkNotNull(to);     // Developer error if it triggers
    // Guava message templates only understand %s placeholders.
    Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
        "char must be in ASCII set: %s", chr);
    int id = from.getId();
    if ((id < 0) || (id >= MAX_STATES)) {
      return;
    }
    stateTable[id][chr] = to;
  }

  /**
   * Registers explicit transitions to {@code to} for every character in the
   * inclusive range {@code start} to {@code end}. Nothing is registered when
   * {@code start} is greater than {@code end}.
   *
   * @throws IllegalArgumentException if either bound is not below
   *         {@code MAX_CHARS}
   */
  private void setRange(InternalState from, char start, char end,
                        InternalState to) {
    // Developer error if either trigger.
    // Guava message templates only understand %s placeholders.
    Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
        "char must be in ASCII set: %s", start);
    Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
        "char must be in ASCII set: %s", end);
    char c;
    for (c = start; c <= end; c++) {
      setDestination(from, c, to);
    }
  }
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/StateTableTransition.java 0000644 0001750 0001750 00000004251 11427052602 030177 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
/**
* Holds one state transition as derived from a Python configuration
* file. A state transition is a triplet as follows:
* (expression, source state, destination state).
* For example, the triplet ("a-z123", A, B) will cause the
* state to go from A to B for any character that is either 1,2,3 or in
* the range a-z inclusive.
*/
class StateTableTransition {

  /** Characters (and character ranges) that trigger the transition. */
  private final String expression;

  /** State the transition starts from. */
  private final InternalState from;

  /** State the transition leads to. */
  private final InternalState to;

  /**
   * Builds a transition from its three components.
   *
   * @param expression the triggering characters, must not be {@code null}
   * @param from the source state, must not be {@code null}
   * @param to the destination state, must not be {@code null}
   * @throws NullPointerException if any argument is {@code null}
   */
  StateTableTransition(String expression, InternalState from,
                       InternalState to) {
    // A null here is a developer error; checkNotNull hands back its argument.
    this.expression = Preconditions.checkNotNull(expression);
    this.from = Preconditions.checkNotNull(from);
    this.to = Preconditions.checkNotNull(to);
  }

  /** @return the expression listing the triggering characters */
  String getExpression() {
    return expression;
  }

  /** @return the source state */
  InternalState getFrom() {
    return from;
  }

  /** @return the destination state */
  InternalState getTo() {
    return to;
  }

  /**
   * Describes this transition for humans. The exact layout of the returned
   * text is unspecified and may change.
   *
   * @return a textual rendering of the expression and both states
   */
  @Override
  public String toString() {
    final String layout = "Expression: %s; From: %s; To: %s";
    return String.format(layout, expression, from, to);
  }
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/JavascriptParserImpl.java 0000644 0001750 0001750 00000032334 11427052602 030204 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.JavascriptParser;
import com.google.streamhtmlparser.util.HtmlUtils;
import com.google.streamhtmlparser.util.JavascriptTokenBuffer;
import java.util.Map;
/**
* Many comments copied almost verbatim from the original C version.
*/
public class JavascriptParserImpl extends GenericParser
implements JavascriptParser {
final static InternalState JS_TEXT;
final static InternalState JS_Q;
final static InternalState JS_Q_E;
final static InternalState JS_DQ;
final static InternalState JS_DQ_E;
final static InternalState JS_SLASH;
final static InternalState JS_REGEXP_SLASH;
final static InternalState JS_REGEXP;
final static InternalState JS_REGEXP_BRK;
final static InternalState JS_REGEXP_BRK_E;
final static InternalState JS_REGEXP_E;
final static InternalState JS_COM_LN;
final static InternalState JS_COM_ML;
final static InternalState JS_COM_ML_CLOSE;
final static InternalState JS_COM_AFTER;
static {
JS_TEXT = InternalState.getInstanceJavascript("JS_TEXT");
JS_Q = InternalState.getInstanceJavascript("JS_Q");
JS_Q_E = InternalState.getInstanceJavascript("JS_Q_E");
JS_DQ = InternalState.getInstanceJavascript("JS_DQ");
JS_DQ_E = InternalState.getInstanceJavascript("JS_DQ_E");
JS_SLASH = InternalState.getInstanceJavascript("JS_SLASH");
JS_REGEXP = InternalState.getInstanceJavascript("JS_REGEXP");
JS_REGEXP_SLASH = InternalState.getInstanceJavascript("JS_REGEXP_SLASH");
JS_REGEXP_E = InternalState.getInstanceJavascript("JS_REGEXP_E");
JS_REGEXP_BRK = InternalState.getInstanceJavascript("JS_REGEXP_BRK");
JS_REGEXP_BRK_E = InternalState.getInstanceJavascript("JS_REGEXP_BRK_E");
JS_COM_LN = InternalState.getInstanceJavascript("COMMENT_LN");
JS_COM_ML = InternalState.getInstanceJavascript("COMMENT_ML");
JS_COM_ML_CLOSE = InternalState.getInstanceJavascript("COMMENT_ML_CLOSE");
JS_COM_AFTER = InternalState.getInstanceJavascript("COMMENT_AFTER");
}
private static final Map Comment copied verbatim from the corresponding C-version.
*
* Implements the logic to figure out if this slash character is a
* division operator or if it opens a regular expression literal.
* This is heavily inspired by the syntactic resynchronization
* for javascript 2.0:
*
* When we receive a '/', we look at the previous non space character
* to figure out if it's the ending of a punctuator that can precede a
* regexp literal, in which case we assume the current '/' is part of a
* regular expression literal (or the opening of a javascript comment,
* but that part is dealt with in the state machine). The exceptions to
* this are unary operators, so we look back a second character to rule
* out '++' and '--'.
*
* Although it is not straightforward to figure out if the binary
* operator is a postfix of the previous expression or a prefix of the
* regular expression, we rule out the latter as it is an uncommon practice.
*
* If we ruled out the previous token to be a valid regexp preceding
* punctuator, we extract the last identifier in the buffer and match
* against a list of keywords that are known to precede expressions in
* the grammar. If we get a match on any of these keywords, then we are
* opening a regular expression, if not, then we have a division operator.
*
* Known cases that are accepted by the grammar but we handle
* differently, although I (falmeida) don't believe there is a
* legitimate usage for those:
* Division of a regular expression: var result = /test/ / 5;
* Prefix unary increment of a regular expression: var result = ++/test/;
* Division of an object literal: { a: 1 } /x/.exec('x');
*
* @param state being entered to
* @param input character being processed
* @return state next state to go to, may be the same as the one we
* were called with
*
* @see <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
* Syntactic Resynchronization</a>
*/
private InternalState enterStateJsSlash(InternalState state, char input) {
  // Index of the character that decides the meaning of this '/': the last
  // buffered character, or the one before it when the last is whitespace.
  final int decisive =
      HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-1)) ? -2 : -1;
  final char previous = ccBuffer.getChar(decisive);

  final boolean startsRegexp;
  if (previous == '+' || previous == '-') {
    // A doubled sign ("++" / "--") is a unary operator and is ignored;
    // a single one can precede a regular expression.
    startsRegexp = ccBuffer.getChar(decisive - 1) != previous;
  } else if (previous == '\0'
      || "=<>&|!%*/,;?:^~{([}".indexOf(previous) >= 0) {
    // Punctuator endings after which a regular expression may start; '\0'
    // is what the buffer returns when there is no such character.
    startsRegexp = true;
  } else {
    // Otherwise only a keyword known to precede an expression opens a regexp.
    final String keyword = ccBuffer.getLastIdentifier();
    startsRegexp = keyword != null
        && HtmlUtils.isJavascriptRegexpPrefix(keyword);
  }

  ccBuffer.appendChar(input);
  return startsRegexp ? JS_REGEXP_SLASH : state;
}
/**
* Called at the end of a javascript comment.
*
* When we open a comment, the initial '/' was inserted into the ring
* buffer, but it is not a token and should be considered whitespace
* for parsing purposes.
*
* When we first saw the '/' character, we didn't yet know if it was
* the beginning of a comment, a division operator, or a regexp.
*
* In this function we just replace the initial '/' with a whitespace
* character, unless we had a preceding whitespace character, in which
* case we just remove the '/'. This is needed to ensure all spaces in
* the buffer are correctly folded.
*/
private void enterStateJsCommentAfter() {
  // The last buffered character is the '/' that opened the comment.
  final boolean precededByWhitespace =
      HtmlUtils.isJavascriptWhitespace(ccBuffer.getChar(-2));
  if (!precededByWhitespace) {
    // Turn the '/' into a single space.
    ccBuffer.setChar(-1, ' ');
    return;
  }
  // Whitespace is already there: just drop the '/'.
  ccBuffer.popChar();
}
private void inStateJsText(char input) {
// Record the character in the token buffer (ccBuffer); enterStateJsSlash()
// later reads that buffer back to tell a division from a regexp literal.
ccBuffer.appendChar(input);
}
// ======================================================= //
// SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE. //
// ======================================================= //
// Records in STATE_MAPPING that the given internal state is reported
// as the given external state.
private static void registerMapping(InternalState internalState,
ExternalState externalState) {
STATE_MAPPING.put(internalState, externalState);
}
private static void initializeStateMapping() {
  // Each parser implementation must map the error state appropriately.
  registerMapping(InternalState.INTERNAL_ERROR_STATE,
      JavascriptParser.STATE_ERROR);

  final ExternalState text = JavascriptParser.STATE_TEXT;
  final ExternalState singleQuoted = JavascriptParser.STATE_Q;
  final ExternalState doubleQuoted = JavascriptParser.STATE_DQ;
  final ExternalState regexp = JavascriptParser.STATE_REGEXP;
  final ExternalState comment = JavascriptParser.STATE_COMMENT;

  // Plain text.
  registerMapping(JS_TEXT, text);

  // String literals, including their escape states.
  registerMapping(JS_Q, singleQuoted);
  registerMapping(JS_Q_E, singleQuoted);
  registerMapping(JS_DQ, doubleQuoted);
  registerMapping(JS_DQ_E, doubleQuoted);

  // A '/' whose meaning is not yet known is still reported as text.
  registerMapping(JS_SLASH, text);
  registerMapping(JS_REGEXP_SLASH, text);

  // Regular expression literals.
  registerMapping(JS_REGEXP, regexp);
  registerMapping(JS_REGEXP_BRK, regexp);
  registerMapping(JS_REGEXP_BRK_E, regexp);
  registerMapping(JS_REGEXP_E, regexp);

  // Comments; the state right after a comment is text again.
  registerMapping(JS_COM_LN, comment);
  registerMapping(JS_COM_ML, comment);
  registerMapping(JS_COM_ML_CLOSE, comment);
  registerMapping(JS_COM_AFTER, text);
}
private static void registerTransition(String expression,
                                       InternalState source,
                                       InternalState to) {
  // Going through a StateTableTransition may look silly, but building one
  // adds extra data checking before the values reach the state table.
  final StateTableTransition transition =
      new StateTableTransition(expression, source, to);
  final String trigger = transition.getExpression();
  final InternalState origin = transition.getFrom();
  final InternalState destination = transition.getTo();
  STATE_TABLE.setExpression(trigger, origin, destination);
}
private static void initializeParserStateTable() {
  // Expression that registers the default transition of a state.
  final String otherwise = "[:default:]";

  // Right after a comment.
  registerTransition(otherwise, JS_COM_AFTER, JS_TEXT);
  registerTransition("/", JS_COM_AFTER, JS_SLASH);
  registerTransition("\"", JS_COM_AFTER, JS_DQ);
  registerTransition("\'", JS_COM_AFTER, JS_Q);

  // Multi-line comment, after a '*' that may close it.
  registerTransition(otherwise, JS_COM_ML_CLOSE, JS_COM_ML);
  registerTransition("/", JS_COM_ML_CLOSE, JS_COM_AFTER);

  // Multi-line comment body.
  registerTransition(otherwise, JS_COM_ML, JS_COM_ML);
  registerTransition("*", JS_COM_ML, JS_COM_ML_CLOSE);

  // Single-line comment.
  registerTransition(otherwise, JS_COM_LN, JS_COM_LN);
  registerTransition("\n", JS_COM_LN, JS_COM_AFTER);

  // Escapes inside a regular expression.
  registerTransition(otherwise, JS_REGEXP_E, JS_REGEXP);
  registerTransition(otherwise, JS_REGEXP_BRK_E, JS_REGEXP_BRK);

  // Bracket expression inside a regular expression.
  registerTransition(otherwise, JS_REGEXP_BRK, JS_REGEXP_BRK);
  registerTransition("]", JS_REGEXP_BRK, JS_REGEXP);
  registerTransition("\\", JS_REGEXP_BRK, JS_REGEXP_BRK_E);

  // Regular expression body.
  registerTransition(otherwise, JS_REGEXP, JS_REGEXP);
  registerTransition("/", JS_REGEXP, JS_TEXT);
  registerTransition("[", JS_REGEXP, JS_REGEXP_BRK);
  registerTransition("\\", JS_REGEXP, JS_REGEXP_E);

  // A '/' that may open a regular expression or a comment.
  registerTransition(otherwise, JS_REGEXP_SLASH, JS_REGEXP);
  registerTransition("[", JS_REGEXP_SLASH, JS_REGEXP_BRK);
  registerTransition("\\", JS_REGEXP_SLASH, JS_REGEXP_E);
  registerTransition("*", JS_REGEXP_SLASH, JS_COM_ML);
  registerTransition("/", JS_REGEXP_SLASH, JS_COM_LN);

  // A '/' that is a division operator unless it opens a comment.
  registerTransition(otherwise, JS_SLASH, JS_TEXT);
  registerTransition("*", JS_SLASH, JS_COM_ML);
  registerTransition("/", JS_SLASH, JS_COM_LN);

  // Double-quoted string literal and its escape state.
  registerTransition(otherwise, JS_DQ_E, JS_DQ);
  registerTransition(otherwise, JS_DQ, JS_DQ);
  registerTransition("\"", JS_DQ, JS_TEXT);
  registerTransition("\\", JS_DQ, JS_DQ_E);

  // Single-quoted string literal and its escape state.
  registerTransition(otherwise, JS_Q_E, JS_Q);
  registerTransition(otherwise, JS_Q, JS_Q);
  registerTransition("\'", JS_Q, JS_TEXT);
  registerTransition("\\", JS_Q, JS_Q_E);

  // Plain text.
  registerTransition(otherwise, JS_TEXT, JS_TEXT);
  registerTransition("/", JS_TEXT, JS_SLASH);
  registerTransition("\"", JS_TEXT, JS_DQ);
  registerTransition("\'", JS_TEXT, JS_Q);
}
} jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/ParseException.java 0000644 0001750 0001750 00000002572 11427052602 026070 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser;
/**
* Checked exception thrown on an unrecoverable error during parsing.
*
* @see Parser#parse(String)
*/
public class ParseException extends Exception {

  /**
   * Constructs a {@code ParseException} with no detail message.
   */
  public ParseException() {
    super();
  }

  /**
   * Constructs a {@code ParseException} whose detail message combines the
   * parser's current line and column numbers with the supplied message.
   *
   * @param parser the {@code Parser} that triggered the exception;
   *        it is dereferenced, so it must not be {@code null}
   * @param msg the error message
   */
  public ParseException(Parser parser, String msg) {
    super(describe(parser, msg));
  }

  /** Formats the detail message: position of the parser, then the message. */
  private static String describe(Parser parser, String msg) {
    return String.format("At line: %d (col: %d); %s",
        parser.getLineNumber(),
        parser.getColumnNumber(),
        msg);
  }
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/JavascriptParserFactory.java 0000644 0001750 0001750 00000002560 11427052602 027747 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser;
import com.google.streamhtmlparser.impl.JavascriptParserImpl;
/**
* A factory class to obtain instances of an Note that we do not typically expect a caller of this package to require
* an instance of a Position must be negative where -1 is the index of the last
* character in the buffer.
*
* @param position The index into the buffer
*
* @return character at the requested index
*/
public char getChar(int position) {
  assert(position < 0);   // Developer error if it triggers.
  // A negative absolute index means there is no character at that position;
  // the NUL character is returned in that case.
  final int index = getAbsolutePosition(position);
  return (index < 0) ? '\0' : buffer[index];
}
/**
* Sets the given {@code input} at the given {@code position} of the buffer.
* Returns {@code true} if we succeeded or {@code false} if we
* failed (i.e. the write was beyond the buffer boundary).
*
* Index positions are negative where -1 is the index of the
* last character in the buffer.
*
* @param position The index at which to set the character
* @param input The character to set in the buffer
* @return {@code true} if we succeeded, {@code false} otherwise
*/
public boolean setChar(int position, char input) {
  assert(position < 0);   // Developer error if it triggers.
  final int index = getAbsolutePosition(position);
  // Only write when the position maps to a slot of the buffer.
  final boolean writable = index >= 0;
  if (writable) {
    buffer[index] = input;
  }
  return writable;
}
/**
* Returns the last javascript identifier/keyword in the buffer.
*
* @return the last identifier or {@code null} if none was found
*/
public String getLastIdentifier() {
  // Skip one trailing whitespace character, if present.
  final int last = HtmlUtils.isJavascriptWhitespace(getChar(-1)) ? -2 : -1;

  // Walk backwards while the characters can belong to an identifier.
  int cursor = last;
  while (HtmlUtils.isJavascriptIdentifier(getChar(cursor))) {
    cursor--;
  }

  // cursor now sits just before the identifier.
  final int first = cursor + 1;
  if (first >= last) {
    // No identifier found; a one-character identifier is also reported
    // as absent by this check.
    return null;
  }
  return slice(first, last);
}
/**
* Returns a slice of the buffer delimited by the given indices.
*
* The start and end indexes represent the start and end of the
* slice to copy. If the start argument extends beyond the beginning
* of the buffer, the slice will only contain characters
* starting from the beginning of the buffer.
*
* @param start The index of the first character to copy
* @param end the index of the last character to copy
*
* @return {@code String} between the given indices
*/
public String slice(int start, int end) {
  // Developer error if any of the checks below fail: both indices are
  // negative offsets from the end of the buffer and must be ordered.
  Preconditions.checkArgument(start <= end);
  Preconditions.checkArgument(start < 0);
  Preconditions.checkArgument(end < 0);

  // The accumulator never leaves this method, so the unsynchronized
  // StringBuilder is sufficient.
  StringBuilder output = new StringBuilder();
  for (int position = start; position <= end; position++) {
    char c = getChar(position);
    // getChar() yields '\0' for positions without a character; skip those so
    // a start before the beginning of the buffer simply shortens the slice.
    if (c != '\0') {
      output.append(c);
    }
  }
  return output.toString();
}
/**
* Returns the position relative to the start of the buffer or -1
* if the position is past the size of the buffer.
*
* @param position the index to be translated
* @return the position relative to the start of the buffer
*/
private int getAbsolutePosition(int position) {
  assert (position < 0);   // Developer error if it triggers.
  final int capacity = buffer.length;
  if (position <= -capacity) {
    return -1;
  }

  // Distance from startIndex to endIndex, wrapped around the buffer end.
  int stored = endIndex - startIndex;
  if (stored < 0) {
    stored += capacity;
  }
  if (position < -stored) {
    return -1;
  }

  // Java's % keeps the sign of the dividend, so fold negatives back in.
  final int wrapped = (position + endIndex) % capacity;
  return (wrapped < 0) ? wrapped + capacity : wrapped;
}
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/util/CharacterRecorder.java 0000644 0001750 0001750 00000012114 11427052602 027467 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.util;
/**
* Records (stores) characters supplied one at a time conditional on
* whether recording is currently enabled.
*
* When {@link #maybeRecord(char)} is called, it will add the
* supplied character to the recording buffer but only if
* recording is in progress. This is useful in our
* {@link com.google.streamhtmlparser.HtmlParser}
* as the caller logic to enable/disable recording is decoupled from the logic
* of recording.
*
* This is a specialized class - of no use to external code -
* which aims to be 100% compatible with the corresponding logic
* in the C-version of the HtmlParser, specifically in
* The The functionality exposed is designed to be 100% compatible with
* the corresponding logic in the C-version of the HtmlParser as such
* we are particularly concerned with cross-language compatibility.
*
* Note: The words {@code Javascript} and {@code ECMAScript} are used
* interchangeably unless otherwise noted.
*/
public final class HtmlUtils {
/**
* static utility class
*/
private HtmlUtils() {
// Intentionally empty and private so the class cannot be instantiated
// from outside.
} // COV_NF_LINE
/**
* Indicates the type of content contained in the {@code content} HTML
* attribute of the {@code meta} HTML tag. Used by
* {@link HtmlUtils#parseContentAttributeForUrl(String)}.
* The values are:
* The token {@code void} was added to the list. Several keywords are
* defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
* simple we do not differentiate on the version and bundle them all together.
*/
private static final Set Currently returns {@code true} for any attribute name that starts
* with "on" which is not exactly correct but we trust a developer to
* not use non-spec compliant attribute names (e.g. onbogus).
*
* @param attribute the name of an HTML attribute
* @return {@code false} if the input is null or is not an attribute
* that expects javascript code; {@code true} otherwise
*/
public static boolean isAttributeJavascript(String attribute) {
  // No attribute name, no javascript.
  if (attribute == null) {
    return false;
  }
  // Any name beginning with "on" is treated as an event handler; the
  // comparison is case-sensitive.
  return attribute.startsWith("on");
}
/**
* Determines if the HTML attribute specified expects a {@code style}
* for its value. Currently this is only true for the {@code style}
* HTML attribute.
*
* @param attribute the name of an HTML attribute
* @return {@code true} iff the attribute name is one that expects a
* style for a value; otherwise {@code false}
*/
public static boolean isAttributeStyle(String attribute) {
  // Exact, case-sensitive match; a null name never matches.
  return attribute != null && attribute.equals("style");
}
/**
* Determines if the HTML attribute specified expects a {@code URI}
* for its value. For example, both {@code href} and {@code src}
* expect a {@code URI} but {@code style} does not. Returns
* {@code false} if the attribute given was {@code null}.
*
* @param attribute the name of an HTML attribute
* @return {@code true} if the attribute name is one that expects
* a URI for a value; otherwise {@code false}
*
* @see #ATTRIBUTE_EXPECTS_URI
*/
public static boolean isAttributeUri(String attribute) {
// Plain membership test against ATTRIBUTE_EXPECTS_URI.
// NOTE(review): a null attribute relies on that set's contains(null)
// returning false rather than throwing; confirm against its declaration.
return ATTRIBUTE_EXPECTS_URI.contains(attribute);
}
/**
* Determines if the specified character is an HTML whitespace character.
* A character is an HTML whitespace character if and only if it is one
* of the characters below.
* Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
* particular, this list is quite different from that in
* This function expects to receive the value of the {@code content} HTML
* attribute. This attribute takes on different meanings depending on the
* value of the {@code http-equiv} HTML attribute of the same {@code meta}
* tag. Since we may not have access to the {@code http-equiv} attribute,
* we instead rely on parsing the given value to determine if it contains
* a URL.
*
* The specification of the {@code meta} HTML tag can be found in:
* http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
*
* We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
* value contains a URL and whether we are at the start of the URL or past
* the start. We are at the start of the URL if and only if one of the two
* conditions below is true:
* Examples:
* Decodes (unescapes) HTML entities with the complication that these
* are received one character at a time hence must be stored temporarily.
* Also, we may receive some "junk" characters before the actual
* entity which we will discard.
*
* This class is designed to be 100% compatible with the corresponding
* logic in the C-version of the
* {@link com.google.streamhtmlparser.HtmlParser}, found
* in Valid HTML entities have one of the following three forms:
* A
* In the unlikely case that this class fails to initialize properly
* (a developer error), an error is emitted to the error console and the logs
* and the specialized parser creation methods will throw
* an {@link AssertionError} on all invocations.
*/
public class HtmlParserFactory {
// Logger named after this class; the static initializer uses it to report
// a failed initialization.
private static final Logger logger =
Logger.getLogger(HtmlParserFactory.class.getName());
/**
* To provide additional options when creating an {@code HtmlParser} using
* {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
* boolean, Set)}
*/
public enum AttributeOptions {

  /**
   * The attribute value is Javascript-quoted. This only has an effect for
   * attributes that accept Javascript - those identified by
   * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
   * HTML quoted.
   */
  JS_QUOTED,

  /**
   * The attribute value is only a fragment of a URL rather than a full URL.
   * In particular the value does not sit at the start of a URL, so the URL
   * scheme does not need to be validated.
   * Only valid for attributes that accept a URI - those identified by
   * {@link HtmlParser.ATTR_TYPE#URI}.
   */
  URL_PARTIAL
}
/**
* To provide additional options when creating an {@code HtmlParser} using
* {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
*/
public enum ModeOptions {

  /**
   * The parser starts inside a quoted {@code String}. Only valid in the
   * {@link HtmlParser.Mode#JS} mode.
   */
  JS_QUOTED;
}
// Shared parser instances, one per supported starting context. Each starts
// out as a fresh parser from createParser(). Presumably initializeParsers(),
// called from the static initializer, then drives each one into the state
// its name describes - TODO confirm.
private static final HtmlParser parserInDefaultAttr = createParser();
private static final HtmlParser parserInDefaultAttrQ = createParser();
private static final HtmlParser parserInUriAttrComplete = createParser();
private static final HtmlParser parserInUriAttrQComplete = createParser();
private static final HtmlParser parserInUriAttrPartial = createParser();
private static final HtmlParser parserInUriAttrQPartial = createParser();
private static final HtmlParser parserInJsAttr = createParser();
private static final HtmlParser parserInJsAttrQ = createParser();
private static final HtmlParser parserInQJsAttr = createParser();
private static final HtmlParser parserInStyleAttr = createParser();
private static final HtmlParser parserInStyleAttrQ = createParser();
private static final HtmlParser parserInJsQ = createParser();
/**
* Protects all the createParserXXX methods by throwing a run-time exception
* if this class failed to initialize properly.
*/
private static boolean initSuccess = false;
static {
try {
initializeParsers();
initSuccess = true;
} catch (ParseException e) {
// Log a severe error and print it to stderr along with a stack trace.
String error = HtmlParserFactory.class.getName() +
" Failed initialization: " + e.getMessage();
logger.severe(error);
System.err.println(error);
e.printStackTrace();
}
}
// Static class.
private HtmlParserFactory() {
// Private and intentionally empty: it exists only so that no instance of
// this class can be created from outside.
} // COV_NF_LINE
/**
* Returns an {@code HtmlParser} object ready to parse HTML input.
*
* @return a newly constructed {@code HtmlParser}
*/
public static HtmlParser createParser() {
// Every call constructs a fresh HtmlParserImpl; nothing is cached or shared.
return new HtmlParserImpl();
}
/**
* Returns an {@code HtmlParser} object initialized with the
* requested Mode. Provide non {@code null} options to provide
* a more precise initialization with the desired Mode.
*
* @param mode the mode to reset the parser with
* @param options additional options or {@code null} for none
* @return an {@code HtmlParser} in the provided mode
* @throws AssertionError when this class failed to initialize
*/
public static HtmlParser createParserInMode(HtmlParser.Mode mode,
Set For example, to create a parser in a state akin to that
* after the parser has parsed "<a href=\"", invoke:
* You must provide the proper value of quoting or the parser
* will go into an unexpected state.
* As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
* of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
* inside an HTML tag where it expects an attribute name not an attribute
* value. It becomes equivalent to a parser initialized in the
* {@code HTML_IN_TAG} mode.
*
* @param attrtype the attribute type which the parser should be in
* @param quoted whether the attribute value is enclosed in double quotes
* @param options additional options or {@code null} for none
* @return an {@code HtmlParser} initialized in the given attribute type
* and quoting
* @throws AssertionError when this class failed to initialize
*/
public static HtmlParser createParserInAttribute(
HtmlParser.ATTR_TYPE attrtype,
boolean quoted, Set In the very unexpected case of the parsing failing (developer error),
* this class will fail to initialize properly.
*
* In addition:
* ParserStateTable
which is
* derived from a state-machine configuration file in the original C++ parser.
*
* InternalState
instances are
* package-scope only as they are only needed by HtmlParserImpl
* and JavascriptParserImpl
.
*/
class InternalState {

  // An InternalState to represent an error condition for all parsers.
  static final InternalState INTERNAL_ERROR_STATE = new InternalState();

  // MAX_ID and FIRST_ID are only used for asserts against developer error.
  private static final int MAX_ID = 255;
  private static final int FIRST_ID = 1;

  // Next id to hand out for each parser family. The counters themselves are
  // never replaced, hence final; only their values advance.
  private static final AtomicInteger htmlStates =
      new AtomicInteger(FIRST_ID);
  private static final AtomicInteger javascriptStates =
      new AtomicInteger(FIRST_ID);

  private final String name;
  private final int id;

  /**
   * @param name the {@code String} identifier for this state
   * @param id the integer identifier for this state, guaranteed to be unique
   * @throws NullPointerException if {@code name} is {@code null}
   * @throws IllegalArgumentException if {@code id} is outside the range
   *         {@code FIRST_ID} to {@code MAX_ID} inclusive
   */
  private InternalState(String name, int id) {
    Preconditions.checkNotNull(name, "name");
    Preconditions.checkArgument(id >= FIRST_ID,
        "state id %s is below the first valid id %s", id, FIRST_ID);
    Preconditions.checkArgument(id <= MAX_ID,
        "state id %s exceeds the maximum id %s", id, MAX_ID);
    this.name = name;
    this.id = id;
  }

  /**
   * Used only for the error state. Bypasses assert checks, which is why
   * the error state can carry the otherwise invalid id 0.
   */
  private InternalState() {
    name = "InternalStateError";
    id = 0;
  }

  /**
   * @return {@code String} name of that state.
   */
  public String getName() {
    return name;
  }

  /**
   * @return {@code int} id of that state.
   */
  public int getId() {
    return id;
  }

  /**
   * @return {@code String} representation of that object, the format
   * may change.
   */
  @Override
  public String toString() {
    return String.format("InternalState: Name: %s; Id: %d", name, id);
  }

  /**
   * Obtain a new {@code InternalState} instance for the HTML parser.
   * Ids are assigned sequentially, starting at {@code FIRST_ID}.
   *
   * @param name a unique identifier for this state useful during debugging
   * @return a new {@code InternalState} object
   * @throws IllegalArgumentException once more than {@code MAX_ID} HTML
   *         states have been requested
   */
  static InternalState getInstanceHtml(String name) {
    int htmlStateId = htmlStates.getAndIncrement();
    return new InternalState(name, htmlStateId);
  }

  /**
   * Obtain a new {@code InternalState} instance for the Javascript parser.
   * Ids are assigned sequentially, starting at {@code FIRST_ID}, from a
   * counter separate from the one used for HTML states.
   *
   * @param name a unique identifier for this state useful during debugging
   * @return a new {@code InternalState} object
   * @throws IllegalArgumentException once more than {@code MAX_ID}
   *         Javascript states have been requested
   */
  static InternalState getInstanceJavascript(String name) {
    int javascriptStateId = javascriptStates.getAndIncrement();
    return new InternalState(name, javascriptStateId);
  }
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/impl/HtmlParserImpl.java 0000644 0001750 0001750 00000073751 11427052602 027012 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.impl;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;
import java.util.Locale;
import java.util.Map;
/**
* A custom specialized parser - ported from the main C++ version - used to
* implement context-aware escaping of run-time data in web-application
* templates.
*
* InternalState
and
* ExternalState
is a many-to-one relationship.
*/
private static final InternalState TEXT;
private static final InternalState TAG_START;
private static final InternalState TAG_NAME;
private static final InternalState DECL_START;
private static final InternalState DECL_BODY;
private static final InternalState COM_OPEN;
private static final InternalState COM_BODY;
private static final InternalState COM_DASH;
private static final InternalState COM_DASH_DASH;
private static final InternalState PI;
private static final InternalState PI_MAY_END;
private static final InternalState TAG_SPACE;
private static final InternalState TAG_CLOSE;
private static final InternalState ATTR;
private static final InternalState ATTR_SPACE;
private static final InternalState VALUE;
private static final InternalState VALUE_TEXT;
private static final InternalState VALUE_Q_START;
private static final InternalState VALUE_Q;
private static final InternalState VALUE_DQ_START;
private static final InternalState VALUE_DQ;
private static final InternalState CDATA_COM_START;
private static final InternalState CDATA_COM_START_DASH;
private static final InternalState CDATA_COM_BODY;
private static final InternalState CDATA_COM_DASH;
private static final InternalState CDATA_COM_DASH_DASH;
private static final InternalState CDATA_TEXT;
private static final InternalState CDATA_LT;
private static final InternalState CDATA_MAY_CLOSE;
private static final InternalState JS_FILE;
private static final InternalState CSS_FILE;
static {
TEXT = InternalState.getInstanceHtml("TEXT");
TAG_START = InternalState.getInstanceHtml("TAG_START");
TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
DECL_START = InternalState.getInstanceHtml("DECL_START");
DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
COM_BODY = InternalState.getInstanceHtml("COM_BODY");
COM_DASH = InternalState.getInstanceHtml("COM_DASH");
COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
PI =InternalState.getInstanceHtml("PI");
PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
ATTR = InternalState.getInstanceHtml("ATTR");
ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
VALUE = InternalState.getInstanceHtml("VALUE");
VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
CDATA_COM_START_DASH =
InternalState.getInstanceHtml("CDATA_COM_START_DASH");
CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
JS_FILE = InternalState.getInstanceHtml("JS_FILE");
CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
}
private static final Map
*
*
* @return {@code true} if and only if the parser is inside CSS
*/
@Override
public boolean inCss() {
  // The dedicated CSS_FILE state always counts as CSS.
  if (currentState == CSS_FILE) {
    return true;
  }
  // So does the value of an attribute whose type is STYLE.
  if (getState() == STATE_VALUE && getAttributeType() == ATTR_TYPE.STYLE) {
    return true;
  }
  // Otherwise the answer depends solely on the recorded tag name.
  return "style".equals(getTag());
}
@Override
public ATTR_TYPE getAttributeType() {
  final String attrName = getAttribute();
  if (!inAttribute()) {
    return ATTR_TYPE.NONE;
  }

  // The attribute name alone decides the first three categories, checked
  // in this order: Javascript, URI, style.
  if (HtmlUtils.isAttributeJavascript(attrName)) {
    return ATTR_TYPE.JS;
  } else if (HtmlUtils.isAttributeUri(attrName)) {
    return ATTR_TYPE.URI;
  } else if (HtmlUtils.isAttributeStyle(attrName)) {
    return ATTR_TYPE.STYLE;
  }

  // Special case: the "content" attribute of a "meta" tag counts as a URI
  // when the value recorded so far has the shape of a redirect.
  final boolean metaContent =
      "meta".equals(getTag()) && "content".equals(getAttribute());
  if (metaContent) {
    final HtmlUtils.META_REDIRECT_TYPE redirect =
        HtmlUtils.parseContentAttributeForUrl(getValue());
    final boolean holdsUrl =
        redirect == HtmlUtils.META_REDIRECT_TYPE.URL_START
            || redirect == HtmlUtils.META_REDIRECT_TYPE.URL;
    if (holdsUrl) {
      return ATTR_TYPE.URI;
    }
  }

  return ATTR_TYPE.REGULAR;
}
@Override
public ExternalState getJavascriptState() {
// Pure delegation: whatever state jsParser reports is returned unchanged.
return jsParser.getState();
}
@Override
public boolean isAttributeQuoted() {
  // Quoted means one of the four VALUE_Q* / VALUE_DQ* internal states;
  // VALUE and VALUE_TEXT do not count.
  final boolean inQValue =
      currentState == VALUE_Q_START || currentState == VALUE_Q;
  final boolean inDqValue =
      currentState == VALUE_DQ_START || currentState == VALUE_DQ;
  return inQValue || inDqValue;
}
@Override
public String getTag() {
  // Lower-case with a fixed locale. The no-argument toLowerCase() follows
  // the JVM default locale, and under e.g. a Turkish locale 'I' maps to a
  // dotless 'ı', so "TITLE" or "SCRIPT" would no longer equal the ASCII
  // tag names the parser compares against.
  return tag.getContent().toLowerCase(Locale.ROOT);
}
@Override
public String getAttribute() {
  if (!inAttribute()) {
    // Outside an attribute there is no name to report.
    return "";
  }
  // Lower-case with a fixed locale. The no-argument toLowerCase() follows
  // the JVM default locale, and under e.g. a Turkish locale 'I' maps to a
  // dotless 'ı', so a name such as "ONCLICK" would not match the ASCII
  // attribute names used to classify attributes.
  return attr.getContent().toLowerCase(Locale.ROOT);
}
@Override
public String getValue() {
  // The recorded value is only reported while the external state is
  // STATE_VALUE; in any other state the empty string is returned.
  if (getState() == STATE_VALUE) {
    return value.getContent();
  }
  return "";
}
@Override
public int getValueIndex() {
  // Outside an attribute value the index is reported as 0.
  return (getState() == STATE_VALUE) ? valueIndex : 0;
}
@Override
public boolean isUrlStart() {
  // Must be inside an attribute value, and that attribute must be of
  // URI type.
  final boolean inUriValue =
      getState() == STATE_VALUE && getAttributeType() == ATTR_TYPE.URI;
  // An insertText() directive at the start of the URL also rules it out.
  if (!inUriValue || textInsideUrlValue) {
    return false;
  }

  if (!"meta".equals(getTag())) {
    // Ordinary URI attributes: the URL starts at index 0 of the value.
    return getValueIndex() == 0;
  }

  // For "meta" we are necessarily in the "content" attribute, otherwise
  // the attribute type would not have been URI.
  final HtmlUtils.META_REDIRECT_TYPE redirect =
      HtmlUtils.parseContentAttributeForUrl(getValue());
  return redirect == HtmlUtils.META_REDIRECT_TYPE.URL_START;
}
/**
* {@inheritDoc}
*
* Resets the state of the parser to a state consistent with the
* {@code Mode} provided. This will reset finer-grained state
* information back to a default value, hence use only when
* you want to parse text from a very clean slate.
*
*
*
*
* @throws ParseException if an unrecoverable error occurred during parsing
*/
@Override
public void insertText() throws ParseException {
  // Case: inside a URL attribute value, right at the start of the URL.
  // Remember that text was inserted there.
  final boolean inUriValue =
      getState() == STATE_VALUE && getAttributeType() == ATTR_TYPE.URI;
  if (inUriValue && isUrlStart()) {
    textInsideUrlValue = true;
  }

  // Case: no character of the attribute value has been parsed yet. Move
  // on to VALUE_TEXT.
  if (currentState == VALUE) {
    setNextState(VALUE_TEXT);
  }
}
@Override
protected InternalState handleEnterState(InternalState currentState,
                                         InternalState expectedNextState,
                                         char input) {
  // TAG_CLOSE is the only state whose entry hook may pick a different
  // next state; every other branch keeps the expected one.
  if (currentState == TAG_CLOSE) {
    return tagClose(currentState);
  }

  if (currentState == TAG_NAME) {
    enterTagName();
  } else if (currentState == ATTR) {
    enterAttribute();
  } else if (currentState == CDATA_MAY_CLOSE) {
    enterStateCdataMayClose();
  } else if (currentState == VALUE) {
    enterValue();
  } else if (currentState == VALUE_TEXT
      || currentState == VALUE_Q
      || currentState == VALUE_DQ) {
    enterValueContent();
  }
  return expectedNextState;
}
@Override
protected InternalState handleExitState(InternalState currentState,
                                        InternalState expectedNextState,
                                        char input) {
  // Leaving CDATA_MAY_CLOSE is the only exit hook that can override the
  // expected next state.
  if (currentState == CDATA_MAY_CLOSE) {
    return exitStateCdataMayClose(expectedNextState, input);
  }

  if (currentState == TAG_NAME) {
    exitTagName();
  } else if (currentState == ATTR) {
    exitAttribute();
  } else if (currentState == VALUE_TEXT
      || currentState == VALUE_Q
      || currentState == VALUE_DQ) {
    exitValueContent();
  }
  return expectedNextState;
}
@Override
protected InternalState handleInState(InternalState currentState,
                                      char input) throws ParseException {
  // All CDATA_* states, and JS_FILE as well, share the CDATA handler.
  final boolean cdataLike =
      currentState == CDATA_TEXT
          || currentState == CDATA_COM_START
          || currentState == CDATA_COM_START_DASH
          || currentState == CDATA_COM_BODY
          || currentState == CDATA_COM_DASH
          || currentState == CDATA_COM_DASH_DASH
          || currentState == CDATA_LT
          || currentState == CDATA_MAY_CLOSE
          || currentState == JS_FILE;
  if (cdataLike) {
    inStateCdata(input);
    return currentState;
  }

  // The three value-content states share the attribute value handler.
  final boolean valueContent =
      currentState == VALUE_TEXT
          || currentState == VALUE_Q
          || currentState == VALUE_DQ;
  if (valueContent) {
    inStateValue(input);
  }

  // This hook never changes the state.
  return currentState;
}
/**
* Invokes recording on all CharacterRecorder objects. Currently we do
* not check that one and only one of them is recording. I did a fair
* bit of testing on the C++ parser and was not convinced there is
* such a guarantee.
*/
@Override
protected void record(char input) {
// The character is offered to every recorder; maybeRecord() stores it only
// in recorders that are currently recording and still have room.
attr.maybeRecord(input);
tag.maybeRecord(input);
value.maybeRecord(input);
cdataCloseTag.maybeRecord(input);
}
/**
* Starts recording the name of the HTML tag. Called when the parser
* enters a new tag.
*/
private void enterTagName() {
// startRecording() also discards any tag name recorded previously.
tag.startRecording();
}
/**
 * Stops recording the name of the HTML tag. A recorded name that begins
 * with '/' (presumably a closing tag) is discarded by resetting the
 * recorder.
 */
private void exitTagName() {
  tag.stopRecording();
  if (tag.getContent().startsWith("/")) {
    tag.reset();
  }
}
/**
* Starts recording the name of the HTML attribute. Called when the parser
* enters a new HTML attribute.
*/
private void enterAttribute() {
// startRecording() also discards any attribute name recorded previously.
attr.startRecording();
}
/**
 * Stops recording the name of the HTML attribute. The characters recorded
 * so far are kept.
 */
private void exitAttribute() {
attr.stopRecording();
}
/**
* Tracks the index within the HTML attribute value and initializes
* the javascript parser for attributes that take javascript.
*
* Called when the parser enters a new HTML attribute value.
*/
private void enterValue() {
  // A new value starts: rewind the position and forget any earlier
  // insertText() marker.
  valueIndex = 0;
  textInsideUrlValue = false;

  final boolean takesJavascript =
      HtmlUtils.isAttributeJavascript(getAttribute());
  if (takesJavascript) {
    // Start entity resolution and Javascript parsing from a clean state.
    entityResolver.reset();
    jsParser.reset();
  }
  insideJavascript = takesJavascript;
}
/**
* Starts recording the contents of the attribute value.
*
* Called when entering an attribute value.
*/
private void enterValueContent() {
// startRecording() also discards any attribute value recorded previously.
value.startRecording();
}
/**
* Stops the recording of the attribute value and exits javascript
* (in case we were inside it).
*/
private void exitValueContent() {
value.stopRecording();
// Cleared unconditionally, whether or not the attribute took Javascript.
insideJavascript = false;
}
/**
* Processes javascript after performing entity resolution and updates
* the position within the attribute value.
* If the status of the entity resolution is IN_PROGRESS
,
* we don't invoke the javascript parser.
*
*
*
*
* JavascriptParser
.
* Currently each instance is a new object given these are fairly
* light-weight.
*
* JavascriptParser
since one is already
* embedded in the more general-purpose HtmlParser
. We still
* make it possible to require one as it may be useful for more
* specialized needs.
*
*/
public class JavascriptParserFactory {
/**
 * Creates a Javascript parser.
 *
 * @return a newly constructed {@code JavascriptParserImpl}; every call
 *         returns a distinct object
 */
public static JavascriptParser getInstance() {
return new JavascriptParserImpl();
}
// Static class.
private JavascriptParserFactory() {
// Private and intentionally empty: it exists only so that no instance of
// this class can be created from outside.
} // COV_NF_LINE
} jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/util/ 0000755 0001750 0001750 00000000000 11767454573 023267 5 ustar chris chris jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/util/JavascriptTokenBuffer.java 0000644 0001750 0001750 00000020256 11427052602 030354 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.util;
import com.google.common.base.Preconditions;
import java.util.Arrays;
/**
* Implements a circular (ring) buffer of characters with specialized
* application logic in order to determine the context of some
* Javascript content that is being parsed.
*
* This is a specialized class - of no use to external code -
* which aims to be 100% compatible with the corresponding logic
* in the C-version of the HtmlParser, specifically
* jsparser.c
. In particular:
*
*
*/
public class JavascriptTokenBuffer {
/**
* Size of the ring buffer used to lookup the last token in the javascript
* stream. The size is somewhat arbitrary but must be larger than
* the biggest token we want to lookup plus three: Two delimiters plus
* an empty ring buffer slot.
*/
private static final int BUFFER_SIZE = 18;
/** Storage implementing the circular buffer. */
private final char[] buffer;
/** Index of the first item in our circular buffer. */
private int startIndex;
/** Index of the last item in our circular buffer. */
private int endIndex;
/**
* Constructs an empty javascript token buffer. The size is fixed,
* see {@link #BUFFER_SIZE}.
*/
public JavascriptTokenBuffer() {
buffer = new char[BUFFER_SIZE];
// Equal start and end indices denote an empty buffer (see popChar).
startIndex = 0;
endIndex = 0;
}
/**
* Constructs a javascript token buffer that is identical to
* the one given. In particular, it has the same size and contents.
*
* @param aJavascriptTokenBuffer the {@code JavascriptTokenBuffer} to copy
*/
public JavascriptTokenBuffer(JavascriptTokenBuffer aJavascriptTokenBuffer) {
  // Take a private copy of the storage so later appends to either buffer
  // do not show up in the other.
  this.buffer = aJavascriptTokenBuffer.buffer.clone();
  this.startIndex = aJavascriptTokenBuffer.startIndex;
  this.endIndex = aJavascriptTokenBuffer.endIndex;
}
/**
* A simple wrapper over appendChar
, it appends a string
* to the buffer. Sequences of whitespace and newlines
* are folded into one character to save space. Null strings are
* not allowed.
*
* @param input the {@code String} to append, cannot be {@code null}
*/
// TODO: Move to testing since not used in code.
public void appendString(String input) {
  if (input == null) {
    throw new NullPointerException("input == null is not allowed");
  }
  // Feed the characters one by one so the whitespace folding done by
  // appendChar applies to strings as well.
  for (char current : input.toCharArray()) {
    appendChar(current);
  }
}
/**
* Appends a character to the buffer. We fold sequences of whitespace and
* newlines into one to save space.
*
* @param input the {@code char} to append
*/
public void appendChar(char input) {
  // A whitespace character directly after another one is dropped.
  final boolean repeatsWhitespace =
      HtmlUtils.isJavascriptWhitespace(input)
          && HtmlUtils.isJavascriptWhitespace(getChar(-1));
  if (repeatsWhitespace) {
    return;
  }

  final int capacity = buffer.length;
  buffer[endIndex] = input;
  endIndex = (endIndex + 1) % capacity;
  if (endIndex == startIndex) {
    // The end caught up with the start: advance the start by one slot,
    // which drops the oldest character.
    startIndex = (startIndex + 1) % capacity;
  }
}
/**
* Returns the last character in the buffer and removes it from the buffer
* or the NUL character '\0' if the buffer is empty.
*
* @return last character in the buffer or '\0' if the buffer is empty
*/
public char popChar() {
  final boolean empty = (startIndex == endIndex);
  if (empty) {
    return '\0';
  }
  // Step the end index back by one slot, wrapping around to the last
  // slot of the array when it is currently at slot 0.
  endIndex = (endIndex == 0) ? buffer.length - 1 : endIndex - 1;
  return buffer[endIndex];
}
/**
* Returns the character at a given index in the buffer or nul ('\0')
* if the index is outside the range of the buffer. Such could happen
* if the buffer is not filled enough or the index is larger than the
* size of the buffer.
*
* statemachine.c
. In particular:
*
*
*/
public class CharacterRecorder {

  /**
   * Upper bound on the number of characters a recorder keeps; characters
   * offered beyond it are dropped. One less than in the C-version because
   * no slot has to be reserved for a terminating null.
   */
  public static final int RECORDING_BUFFER_SIZE = 255;

  /**
   * Backing storage for the recorded characters. Since a
   * {@code CharacterRecorder} is meant to be re-used, the full capacity is
   * requested when the object is created.
   */
  private final StringBuilder sb;

  /** {@code true} while incoming characters are being captured. */
  private boolean recording;

  /**
   * Creates an empty recorder that is not recording yet. Its capacity is
   * fixed, see {@link #RECORDING_BUFFER_SIZE}.
   */
  public CharacterRecorder() {
    this.sb = new StringBuilder(RECORDING_BUFFER_SIZE);
    this.recording = false;
  }

  /**
   * Creates a recorder that duplicates the given one: same recording flag
   * and same recorded contents, held in storage of its own.
   *
   * @param aCharacterRecorder the {@code CharacterRecorder} to copy
   */
  public CharacterRecorder(CharacterRecorder aCharacterRecorder) {
    this.recording = aCharacterRecorder.recording;
    this.sb = new StringBuilder(RECORDING_BUFFER_SIZE)
        .append(aCharacterRecorder.getContent());
  }

  /**
   * Turns recording on. Whatever was recorded before is discarded first.
   */
  public void startRecording() {
    // Truncating keeps the allocated capacity, so this is cheap.
    sb.setLength(0);
    recording = true;
  }

  /**
   * Turns recording off. Already recorded characters are kept.
   */
  public void stopRecording() {
    recording = false;
  }

  /**
   * Stores {@code input} provided recording is on and the size limit has
   * not been reached; otherwise does nothing.
   *
   * @param input the character to record
   */
  public void maybeRecord(char input) {
    if (!recording) {
      return;
    }
    if (sb.length() >= RECORDING_BUFFER_SIZE) {
      return;
    }
    sb.append(input);
  }

  /**
   * Discards the recorded characters and leaves the recording flag
   * untouched.
   */
  public void clear() {
    sb.setLength(0);
  }

  /**
   * Discards the recorded characters and turns recording off.
   */
  public void reset() {
    clear();
    recording = false;
  }

  /**
   * Returns the recorded characters as a {@code String}. The recorder
   * itself is not modified.
   *
   * @return the contents in a {@code String} form
   */
  public String getContent() {
    return sb.toString();
  }

  /**
   * Tells whether incoming characters are currently being recorded.
   *
   * @return {@code true} if we are recording, {@code false} otherwise
   */
  public boolean isRecording() {
    return recording;
  }

  /**
   * Describes the complete state of this object for humans. The exact
   * format is unspecified and may change.
   *
   * @return the full state of this object
   */
  @Override
  public String toString() {
    final boolean active = isRecording();
    return String.format("In recording: %s; Value: %s", active, sb);
  }
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/util/HtmlUtils.java 0000644 0001750 0001750 00000034220 11427052602 026034 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.util;
import com.google.common.collect.ImmutableSortedSet;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
* Utility functions for HTML and Javascript that are most likely
* not interesting to users outside this package.
*
* HtmlParser
will be open-sourced hence we took the
* decision to keep these utilities in this package as well as not to
* leverage others that may exist in the google3
code base.
*
*
*
*/
public enum META_REDIRECT_TYPE {
// Returned by parseContentAttributeForUrl for a null value or a value that
// does not match the redirect pattern.
NONE,
// The redirect pattern matched and nothing follows the match.
URL_START,
// The redirect pattern matched and more content follows the match.
URL
}
/**
* A regular expression matching the format of a {@code content} attribute
* that contains a URL. Used by {@link #parseContentAttributeForUrl}.
*/
private static final String META_REDIRECT_REGEX =
"^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";
// Safe for use by concurrent threads so we compile once.
private static final Pattern META_REDIRECT_PATTERN =
Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);
/**
* Set of keywords that can precede a regular expression literal. Taken from:
*
* Language Syntax
*
*
*
*
* Note: The list includes the zero-width space (Space
character
* Tab
character
* Line feed
character
* Carriage Return
character
* Zero-Width Space
character
* ​
)
* which is not included in the C version.
*
* @param chr the {@code char} to check
* @return {@code true} if the character is an HTML whitespace character
*
* White space
*/
public static boolean isHtmlSpace(char chr) {
// Membership test against HTML_WHITESPACE; chr is auto-boxed for the lookup.
// NOTE(review): HTML_WHITESPACE is declared outside this view - presumably a
// Set<Character>; confirm.
return HTML_WHITESPACE.contains(chr);
}
/**
* Determines if the specified character is an ECMAScript whitespace or line
* terminator character. A character is a whitespace or line terminator if
* and only if it is one of the characters below:
*
*
*
* Tab
, Vertical Tab
,
* Form Feed
, Space
,
* No-break space
)
* Line Feed
,
* Carriage Return
, Line separator
,
* Paragraph Separator
).
* Character.isWhitespace
.
*
* ECMAScript Language Specification
*
* @param chr the {@code char} to check
* @return {@code true} or {@code false}
*
*/
public static boolean isJavascriptWhitespace(char chr) {
// Membership test against JAVASCRIPT_WHITESPACE; chr is auto-boxed for the
// lookup.
// NOTE(review): JAVASCRIPT_WHITESPACE is declared outside this view -
// presumably a Set<Character>; confirm.
return JAVASCRIPT_WHITESPACE.contains(chr);
}
/**
* Determines if the specified character is a valid character in an
* ECMAScript identifier. This determination is currently not exact,
* in particular:
*
*
*
* We are considering leveraging Character.isJavaIdentifierStart
* and Character.isJavaIdentifierPart
given that Java
* and Javascript follow similar identifier naming rules but we lose
* compatibility with the C-version.
*
* @param chr {@code char} to check
* @return {@code true} if the {@code chr} is accepted as a Javascript
* identifier character; otherwise {@code false}
*/
public static boolean isJavascriptIdentifier(char chr) {
  // The two punctuation characters accepted in identifiers.
  if (chr == '_' || chr == '$') {
    return true;
  }
  // Otherwise only ASCII letters and digits qualify.
  final boolean lowerCase = 'a' <= chr && chr <= 'z';
  final boolean upperCase = 'A' <= chr && chr <= 'Z';
  final boolean digit = '0' <= chr && chr <= '9';
  return lowerCase || upperCase || digit;
}
/**
* Determines if the input token provided is a valid token prefix to a
* javascript regular expression. The token argument is compared against
* a {@code Set} of identifiers that can precede a regular expression in the
* javascript grammar, and returns {@code true} if the provided
* {@code String} is in that {@code Set}.
*
* @param input the {@code String} token to check
* @return {@code true} iff the token is a valid prefix of a regexp
*/
public static boolean isJavascriptRegexpPrefix(String input) {
// Plain membership test against REGEXP_TOKEN_PREFIXS.
// NOTE(review): the set is declared outside this view, so how it treats a
// null input or different letter case cannot be told from here - confirm.
return REGEXP_TOKEN_PREFIXS.contains(input);
}
/**
* Encodes the specified character using Ascii for convenient insertion into
* a single-quote enclosed {@code String}. Printable characters
* are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
* back-slash and single quote are all backslash-escaped. All other characters
* are returned hex-encoded.
*
* @param chr {@code char} to encode
* @return an Ascii-friendly encoding of the given {@code char}
*/
public static String encodeCharForAscii(char chr) {
  // Characters with a dedicated backslash escape. Single quote and
  // backslash are printable, so they must be handled before the range
  // check below.
  switch (chr) {
    case '\'':
      return "\\'";
    case '\\':
      return "\\\\";
    case '\n':
      return "\\n";
    case '\r':
      return "\\r";
    case '\t':
      return "\\t";
    default:
      break;
  }

  // Remaining printable ASCII is passed through unchanged.
  final boolean printable = chr >= 32 && chr <= 126;
  if (printable) {
    return String.format("%c", chr);
  }

  // Everything else: backslash-u followed by the code unit as zero-padded
  // lower-case hex with a minimum width of four.
  return String.format("\\u%04x", (int) chr);
}
/**
* Parses the given {@code String} to determine if it contains a URL in the
* format followed by the {@code content} attribute of the {@code meta}
* HTML tag.
*
*
*
*
*
*
*
* @param value {@code String} to parse
* @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
* of a URL in the given value
*/
public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
  if (value == null) {
    return META_REDIRECT_TYPE.NONE;
  }

  final Matcher redirect = META_REDIRECT_PATTERN.matcher(value);
  if (!redirect.find()) {
    return META_REDIRECT_TYPE.NONE;
  }

  // Anything after the matched prefix means part of the URL is already
  // present; otherwise the URL is just about to start.
  final boolean hasMoreContent = value.length() > redirect.end();
  return hasMoreContent
      ? META_REDIRECT_TYPE.URL
      : META_REDIRECT_TYPE.URL_START;
}
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/util/EntityResolver.java 0000644 0001750 0001750 00000022055 11427052602 027110 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser.util;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import java.util.Map;
/**
*
* <meta http-equiv="refresh" content="5; URL=http://www.google.com">
*
*
* <meta http-equiv="refresh" content="5; URL=">
*
*
* <meta http-equiv="content-type" content="text/html">
*
* htmlparser.c
. There are however a few intentional
* differences outlined below:
*
*
*
* processChar
* returns the output {@code String} whereas in Java, we return
* a status code and then provide the {@code String} in a separate
* method getEntity
. It is cleaner as it avoids the
* need to return empty {@code String}s during incomplete processing.
*
*
*
* ⅆ
where dd is a number in decimal (base 10) form.
* &x|Xyy;
where yy is a hex-number (base 16).
* &<html-entity>;
where
* <html-entity>
is one of lt
,
* gt
, amp
, quot
or
* apos
.
* reset
method is provided to facilitate object re-use.
*/
public class EntityResolver {
/**
* Returned in processChar
method.
*
*
*/
public enum Status {
  NOT_STARTED("Not Started"),
  IN_PROGRESS("In Progress"),
  COMPLETED("Completed");

  /** Short human readable label, returned by {@link #toString()}. */
  private final String description;

  Status(String description) {
    this.description = description;
  }

  /**
   * Gives a short description of this {@code Status}, meant for
   * debugging. The format of the returned {@code String} is neither
   * fully specified nor guaranteed to stay the same.
   */
  @Override
  public String toString() {
    return description;
  }
}
/**
 * How many characters to store as we are processing an entity. Once we
 * reach that size, we know the entity is definitely invalid. The size
 * is higher than needed but keeping it as-is for compatibility with
 * the C-version.
 *
 * <p>The count includes the leading {@code '&'}: {@code processChar}
 * stores that character in the same buffer whose length it compares
 * against this limit.
 */
private static final int MAX_ENTITY_SIZE = 10;
/**
* Map containing the recognized HTML entities and their decoded values.
* The trailing ';' is not included in the key but it is accounted for.
*/
private static final MapNOT_STARTED
indicates we are still processing
* trailing characters before the start of an entity.
* The caller may want to save the characters it provided us.
* IN_PROGRESS
indicates we are currently processing
* characters part of an entity.
* COMPLETED
indicates we have finished processing
* an entity. The caller can then invoke getEntity
* then re-set the object for future re-use.
* StreamEntityResolver
* in a human readable form. The format of the returned String
* is not specified and is subject to change.
*
* @return full state of this object
*/
@Override
public String toString() {
  // Debug snapshot: current status, number of buffered characters, and
  // the buffered characters themselves.
  final String state = status.toString();
  final String buffered = sb.toString();
  return String.format("Status: %s; Contents (%d): %s", state,
      buffered.length(), buffered);
}
/**
 * Gives back the result of the most recent decoding. The value is only
 * meaningful once {@code processChar} has returned {@code COMPLETED},
 * which is the only point at which {@code processChar} stores a result.
 *
 * @return the decoded HTML entity, or the raw input when it could not be
 *     decoded; before completion this is whatever the field was last
 *     initialised to (presumably an empty {@code String} -- the
 *     initialisation is not visible here, confirm against the constructor)
 */
public String getEntity() {
  return this.entity;
}
/**
 * Feeds one character of the input stream to the resolver and advances
 * its state accordingly.
 *
 * <ul>
 * <li>{@code NOT_STARTED}: every character is ignored until an
 *     {@code '&'} arrives; that character is buffered and the status
 *     becomes {@code IN_PROGRESS}.
 * <li>{@code IN_PROGRESS}: a {@code ';'} or an HTML space (as decided by
 *     {@code HtmlUtils.isHtmlSpace}) ends the entity and triggers
 *     decoding. Any other character is buffered, unless the buffer
 *     already holds {@code MAX_ENTITY_SIZE} characters, in which case
 *     the buffered text followed by this character is kept verbatim as
 *     the result. Both outcomes move the status to {@code COMPLETED}.
 * <li>{@code COMPLETED}: the character is ignored.
 * </ul>
 *
 * @param input the {@code char} to process
 * @return the status after consuming {@code input}; once it is
 *     {@code COMPLETED} the result is available from {@code getEntity}
 * @throws IllegalStateException if the status is {@code NOT_STARTED}
 *     while the buffer is not empty (developer error)
 */
public Status processChar(char input) {
  // An idle resolver must never be holding buffered characters.
  Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0);

  if (status == Status.NOT_STARTED) {
    // Only '&' can open an entity; anything else is skipped.
    if (input == '&') {
      sb.append(input);
      status = Status.IN_PROGRESS;
    }
    return status;
  }

  if (status != Status.IN_PROGRESS) {
    // Already completed: the character is deliberately ignored.
    return status;
  }

  final boolean terminated = (input == ';') || HtmlUtils.isHtmlSpace(input);
  if (terminated) {
    status = Status.COMPLETED;
    entity = convertEntity(input);
  } else if (sb.length() >= MAX_ENTITY_SIZE) {
    // Too long to be a valid entity: give up and keep the raw text.
    status = Status.COMPLETED;
    entity = uncovertedInput(input);
  } else {
    sb.append(input);
  }
  return status;
}
/**
 * Decodes the entity currently held in the buffer.
 *
 * <p>The buffer holds the raw entity text starting at its {@code '&'}
 * and excluding the terminator. The following are decoded:
 * <ul>
 * <li>numeric character references, {@code &#dd} (decimal) and
 *     {@code &#xhh} / {@code &#Xhh} (hexadecimal); see "Numeric
 *     character references", section 5.3.1 of the HTML 4.01
 *     specification;
 * <li>named entities that are keys of {@code HTML_ENTITIES_MAP}.
 * </ul>
 *
 * <p>Everything else is returned unchanged with the terminator appended:
 * a lone {@code '&'}, {@code "&#"} without digits, a malformed or
 * overflowing number, a number that is not a valid Unicode code point,
 * or an unknown entity name.
 *
 * @param terminator the last character read, unused on successful
 *     conversions since it is the end delimiter of the entity
 * @return the decoded entity, or the original input if it could not be
 *     decoded
 * @throws IllegalArgumentException if the buffer is empty or does not
 *     start with {@code '&'} (developer error)
 */
private String convertEntity(char terminator) {
  // Developer error if the buffer was empty or does not start with '&'.
  Preconditions.checkArgument(sb.length() > 0);
  Preconditions.checkArgument(sb.charAt(0) == '&');

  if (sb.length() > 1) {
    if (sb.charAt(1) == '#') {
      if (sb.length() <= 2) { // "&#" alone => return content as-is.
        return uncovertedInput(terminator);
      }
      final char marker = sb.charAt(2);
      final boolean hex = (marker == 'x') || (marker == 'X');
      try {
        final int codePoint = hex
            ? Integer.parseInt(sb.substring(3), 16) // Hex NCR
            : Integer.parseInt(sb.substring(2)); // Decimal NCR
        return new String(Character.toChars(codePoint));
      } catch (IllegalArgumentException e) {
        // Covers NumberFormatException (bad digits or int overflow, a
        // subclass of IllegalArgumentException) as well as the plain
        // IllegalArgumentException Character.toChars throws when the
        // number is not a valid Unicode code point (e.g. &#x110000; or
        // a negative value). Neither must escape to the caller.
        return uncovertedInput(terminator);
      }
    }

    // See if it matches any of the few recognized entities.
    String key = sb.toString();
    if (HTML_ENTITIES_MAP.containsKey(key)) {
      return HTML_ENTITIES_MAP.get(key);
    }
  }

  // Covers the case of a lonely '&' given or valid/invalid unknown entities.
  return uncovertedInput(terminator);
}
// Rebuilds the raw text exactly as it was received: the buffered
// characters followed by the character that ended processing.
private String uncovertedInput(char terminator) {
  return sb.toString() + terminator;
}
}
jsilver-1.0.0.dfsg.orig/src/com/google/streamhtmlparser/HtmlParserFactory.java 0000644 0001750 0001750 00000027246 11427052602 026555 0 ustar chris chris /*
* Copyright (C) 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.streamhtmlparser;
import com.google.streamhtmlparser.impl.HtmlParserImpl;
import java.util.Set;
import java.util.logging.Logger;
/**
* A factory class to obtain instances of an {@link HtmlParser}.
* Currently each instance is a new object given these are fairly
* light-weight.
*
*
* createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
*
*
*
*
*
* @throws ParseException if parsing failed.
*/
private static void initializeParsers() throws ParseException {
parserInDefaultAttr.parse("