owasp-java-html-sanitizer-0.1+r88/0000775000175000017500000000000011730105506017461 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/pom.xml0000664000175000017500000000313711730105506021002 0ustar jamespagejamespage 4.0.0 owasp-java-html-sanitizer owasp-java-html-sanitizer r88 jar OWASP Java HTML Sanitizer A fast and easy to configure HTML Sanitizer written in Java which lets you include HTML authored by third-parties in your web application while protecting against XSS. http://code.google.com/p/owasp-java-html-sanitizer OWASP http://www.owasp.org/ New BSD License http://www.opensource.org/licenses/bsd-license.php com.google.guava guava r09 com.google.code.findbugs jsr305 1.3.9 owasp-java-html-sanitizer-0.1+r88/src/0000775000175000017500000000000011730105506020250 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/0000775000175000017500000000000011730105506021174 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/0000775000175000017500000000000011730105507022116 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/META-INF/0000775000175000017500000000000011654053470023264 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/META-INF/MANIFEST.MF0000664000175000017500000000007411654053470024717 0ustar jamespagejamespageManifest-Version: 1.0 Created-By: 1.6.0_26 (Apple Inc.) 
owasp-java-html-sanitizer-0.1+r88/src/main/java/org/0000775000175000017500000000000011654053470022713 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/0000775000175000017500000000000011654053470024044 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/0000775000175000017500000000000011654053470025010 5ustar jamespagejamespageowasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/ElementAndAttributePolicies.java0000664000175000017500000000667711654053470033263 0ustar jamespagejamespage// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.Map; import com.google.common.collect.ImmutableMap; import javax.annotation.concurrent.Immutable; /** * Encapsulates all the information needed by the * {@link ElementAndAttributePolicyBasedSanitizerPolicy} to sanitize one kind * of element. */ @Immutable final class ElementAndAttributePolicies { final String elementName; final boolean isVoid; final ElementPolicy elPolicy; final ImmutableMap attrPolicies; final boolean skipIfEmpty; ElementAndAttributePolicies( String elementName, ElementPolicy elPolicy, Map attrPolicies, boolean skipIfEmpty) { this.elementName = elementName; this.isVoid = HtmlTextEscapingMode.isVoidElement(elementName); this.elPolicy = elPolicy; this.attrPolicies = ImmutableMap.copyOf(attrPolicies); this.skipIfEmpty = skipIfEmpty; } ElementAndAttributePolicies and(ElementAndAttributePolicies p) { assert elementName.equals(p.elementName): elementName + " != " + p.elementName; ImmutableMap.Builder joinedAttrPolicies = ImmutableMap.builder(); for (Map.Entry e : this.attrPolicies.entrySet()) { String attrName = e.getKey(); AttributePolicy a = e.getValue(); AttributePolicy b = p.attrPolicies.get(attrName); if (b != null) { a = AttributePolicy.Util.join(a, b); } joinedAttrPolicies.put(attrName, a); } for (Map.Entry e : p.attrPolicies.entrySet()) { String attrName = e.getKey(); if (!this.attrPolicies.containsKey(attrName)) { joinedAttrPolicies.put(attrName, e.getValue()); } } return new 
ElementAndAttributePolicies( elementName, ElementPolicy.Util.join(elPolicy, p.elPolicy), joinedAttrPolicies.build(), skipIfEmpty || p.skipIfEmpty); } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootowasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/FilterUrlByProtocolAttributePolicy.javaowasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/FilterUrlByProtocolAttributePolicy.ja0000664000175000017500000001160611654053470034321 0ustar jamespagejamespage// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import javax.annotation.Nullable; import com.google.common.collect.ImmutableSet; /** * An attribute policy for attributes whose values are URLs that requires that * the value have no protocol or have an allowed protocol. * *

* URLs with protocols must match the protocol set passed to the constructor. * URLs without protocols but which specify an origin different from the * containing page (e.g. {@code //example.org}) are only allowed if the * {@link FilterUrlByProtocolAttributePolicy#allowProtocolRelativeUrls policy} * allows both {@code http} and {@code https} which are normally used to serve * HTML. * Same-origin URLs, URLs without any protocol or authority part are always * allowed. *

* *

* This class assumes that URLs are either hierarchical, or are opaque, but * do not look like they contain an authority portion. *

* * @author Mike Samuel */ @TCB public class FilterUrlByProtocolAttributePolicy implements AttributePolicy { private final ImmutableSet protocols; public FilterUrlByProtocolAttributePolicy( Iterable protocols) { this.protocols = ImmutableSet.copyOf(protocols); } public @Nullable String apply( String elementName, String attributeName, String s) { protocol_loop: for (int i = 0, n = s.length(); i < n; ++i) { switch (s.charAt(i)) { case '/': case '#': case '?': // No protocol. // Check for domain relative URLs like //www.evil.org/ if (s.startsWith("//") // or the protocols by which HTML is normally served are OK. && !allowProtocolRelativeUrls()) { return null; } break protocol_loop; case ':': if (!protocols.contains(s.substring(i))) { return null; } break protocol_loop; } } return normalizeUri(s); } protected boolean allowProtocolRelativeUrls() { return protocols.contains("http") && protocols.contains("https"); } /** Percent encodes anything that looks like a colon, or a parenthesis. */ static String normalizeUri(String s) { int n = s.length(); boolean colonsIrrelevant = false; for (int i = 0; i < n; ++i) { char ch = s.charAt(i); switch (ch) { case '/': case '#': case '?': case ':': colonsIrrelevant = true; break; case '(': case ')': case '\uff1a': StringBuilder sb = new StringBuilder(n + 16); int pos = 0; for (; i < n; ++i) { ch = s.charAt(i); switch (ch) { case '(': sb.append(s, pos, i).append("%28"); pos = i + 1; break; case ')': sb.append(s, pos, i).append("%29"); pos = i + 1; break; case '\uff1a': // Full-width colon. if (!colonsIrrelevant) { // TODO: do we need to encode non-colon characters if we're // not dealing with URLs that haven't been copy/pasted into // the URL bar? // Is it safe to assume UTF-8 here? 
sb.append(s, pos, i).append("%ef%bc%9a"); pos = i + 1; } break; } } return sb.append(s, pos, n).toString(); } } return s; } }owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/CssGrammar.java0000664000175000017500000002343311654053470027717 0ustar jamespagejamespage// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import java.util.regex.Matcher; import java.util.regex.Pattern; class CssGrammar { /** * Lexical grammar for CSS tokens converted from * http://www.w3.org/TR/CSS2/grammar.html */ private static final Pattern CSS_TOKEN; static { // nl \n|\r\n|\r|\f ; a newline //String nl = "\n|\r\n|\r|\f"; // h [0-9a-f] ; a hexadecimal digit String h = "[0-9a-f]"; // nonascii [\200-\377] String nonascii = "[" + ((char) 0200) + "-" + ((char) 0377) + "]"; // unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])? String unicode = "(?:(?:\\\\" + h + "{1,6})(?:\r\n|[ \t\r\n\f])?)"; // escape {unicode}|\\[^\r\n\f0-9a-f] String escape = "(?:" + unicode + "|\\\\[^\r\n\f0-9a-f])"; // nmstart [_a-z]|{nonascii}|{escape} String nmstart = "(?:[_a-z]|" + nonascii + "|" + escape + ")"; // nmchar [_a-z0-9-]|{nonascii}|{escape} String nmchar = "(?:[_a-z0-9-]|" + nonascii + "|" + escape + ")"; // ident -?{nmstart}{nmchar}* String ident = "-?" + nmstart + nmchar + "*"; // name {nmchar}+ String name = nmchar + "+"; // hash String hash = "#" + name; // string1 \"([^\n\r\f\\"]|\\{nl}|{escape})*\" ; "string" String string1 = "\"(?:[^\n\r\f\"\\\\]|\\\\.)*\""; // string2 \'([^\n\r\f\\']|\\{nl}|{escape})*\' ; 'string' String string2 = "'(?:[^\n\r\f\'\\\\]|\\\\.)*'"; // string {string1}|{string2} String string = "(?:" + string1 + "|" + string2 + ")"; // num [0-9]+|[0-9]*"."[0-9]+ String num = "(?:[0-9]*\\.[0-9]+|[0-9]+)"; // s [ \t\r\n\f] String s = "[ \t\r\n\f]"; // w {s}* String w = "(?:" + s + "*)"; // url special chars String url_special_chars = "[!#$%&*-~]"; // url chars ({url_special_chars}|{nonascii}|{escape})* String URL_CHARS = "(?:" + url_special_chars + "|" + nonascii + "|" + escape + ")*"; // url String url = "url\\(" + w + "(" + string + "|" + URL_CHARS + ")" + w + "\\)"; // comments // see http://www.w3.org/TR/CSS21/grammar.html String comment = "/\\*(?:\\**[^*])*\\*+/"; // {E}{M} {return EMS;} // {E}{X} {return EXS;} // {P}{X} {return LENGTH;} // {C}{M} {return LENGTH;} // {M}{M} 
{return LENGTH;} // {I}{N} {return LENGTH;} // {P}{T} {return LENGTH;} // {P}{C} {return LENGTH;} // {D}{E}{G} {return ANGLE;} // {R}{A}{D} {return ANGLE;} // {G}{R}{A}{D} {return ANGLE;} // {M}{S} {return TIME;} // {S} {return TIME;} // {H}{Z} {return FREQ;} // {K}{H}{Z} {return FREQ;} // % {return PERCENTAGE;} String unit = "(?:em|ex|px|cm|mm|in|pt|pc|deg|rad|grad|ms|s|hz|khz|%)"; // {num}{UNIT|IDENT} {return NUMBER;} String quantity = num + w + "(?:" + unit + "|" + ident + ")?"; // "" {return CDC;} // "~=" {return INCLUDES;} // "|=" {return DASHMATCH;} // {w}"{" {return LBRACE;} // {w}"+" {return PLUS;} // {w}">" {return GREATER;} // {w}"," {return COMMA;} // Extra punctuation: brackets, dots, slash. String punc = "|~=|\\|=|[\\{\\}\\+>,:;()\\[\\]\\./]"; CSS_TOKEN = Pattern.compile( // Identifier, keyword, or hash in group 1, "((?!url\\b)" + ident + "|" + hash + ")" + "|([+-]?" + quantity + ")" // A quantity in group 2, // A comment in group 0. + "|" + comment // A string, URL, or punctuation in group 3, + "|(" + string + "|" + url + "|" + punc + ")" // or a whitespace in group 0. + "|(?:" + s + "+)|", Pattern.CASE_INSENSITIVE); } /** * Creates a matcher that will match tokens in the CSS in order. * The matcher will have the token in group 0. If the token is an identifier, * keyword, or hash token (color or HTML ID) then it group 1 will be present. * If the token is a quantity, group 2 will be present. * If the token is a string, url, or punctuation, group 3 will be present. */ static Matcher lex(String css) { return CSS_TOKEN.matcher(css); } static void asPropertyGroup(String css, PropertyHandler handler) { // Split tokens by semicolons/curly-braces, then by first colon, // dropping spaces and comments to identify property names and token runs // that form the value. Matcher m = lex(css); propertyNameLoop: while (m.find()) { // Check that we have an identifier that might be a property name. 
if (m.start(1) < 0 || css.charAt(m.start(1)) == '#') { continue; } String name = m.group(0); // Look for a colon. while (m.find()) { if (m.start(1) >= 0) { if (css.charAt(m.start(1)) == '#') { continue propertyNameLoop; } name = m.group(0); } else if (m.start(2) >= 0) { continue propertyNameLoop; } else if (m.start(3) + 1 == m.end(3)) { if (':' == css.charAt(m.start(3))) { break; } else { continue propertyNameLoop; } } } handler.startProperty(Strings.toLowerCase(cssContent(name))); propertyValueLoop: while (m.find()) { if (m.start(1) >= 0) { handler.identifierOrHash(m.group()); } else if (m.start(2) >= 0) { handler.quantity(m.group()); } else if (m.start(3) >= 0) { String token = m.group(0); switch (token.charAt(0)) { case '"': case '\'': handler.quotedString(token); break; case 'u': case 'U': handler.url(token); break; case ';': case '{': case '}': case ':': break propertyValueLoop; default: handler.punctuation(token); } } } handler.endProperty(); } } /** * Decodes any escape sequences and strips any quotes from the input. */ static String cssContent(String token) { int n = token.length(); int pos = 0; StringBuilder sb = null; if (n >= 2) { char ch0 = token.charAt(0); if (ch0 == '"' || ch0 == '\'') { if (ch0 == token.charAt(n - 1)) { pos = 1; --n; sb = new StringBuilder(n); } } } for (int esc; (esc = token.indexOf('\\', pos)) >= 0;) { int end = esc + 2; if (esc > n) { break; } if (sb == null) { sb = new StringBuilder(n); } sb.append(token, pos, esc); int codepoint = token.charAt(end - 1); if (isHex(codepoint)) { // Parse \hhhhh where hhhhh is one or more hex digits // and is an optional space or tab character that can be // used to separate an escape sequence from a following literal hex // digit. while (end < n && isHex(token.charAt(end))) { ++end; } try { codepoint = Integer.parseInt(token.substring(esc + 1, end), 16); } catch (RuntimeException ex) { codepoint = 0xfffd; // Unknown codepoint. 
} if (end < n) { char ch = token.charAt(end); if (ch == ' ' || ch == '\t') { // Ignorable hex follower. ++end; } } } sb.appendCodePoint(codepoint); pos = end; } if (sb == null) { return token; } return sb.append(token, pos, n).toString(); } private static boolean isHex(int codepoint) { return ('0' <= codepoint && codepoint <= '9') || ('A' <= codepoint && codepoint <= 'F') || ('a' <= codepoint && codepoint <= 'f'); } interface PropertyHandler { void startProperty(String propertyName); void quantity(String token); void identifierOrHash(String token); void quotedString(String token); void url(String token); void punctuation(String token); void endProperty(); } } owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlLexer.java0000664000175000017500000006131411654053470027564 0ustar jamespagejamespage// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import java.util.LinkedList; import java.util.NoSuchElementException; import java.util.Set; import javax.annotation.concurrent.NotThreadSafe; /** * A flexible lexer for HTML. * This is hairy code, but it is outside the TCB for the HTML sanitizer. * * @author Mike Samuel */ @NotThreadSafe final class HtmlLexer extends AbstractTokenStream { private final String input; private final HtmlInputSplitter splitter; private State state = State.OUTSIDE_TAG; public HtmlLexer(String input) { this.input = input; this.splitter = new HtmlInputSplitter(input); } /** * Normalize case of names that are not name-spaced. This lower-cases HTML * element and attribute names, but not ones for embedded SVG or MATHML. */ static String canonicalName(String elementOrAttribName) { return elementOrAttribName.indexOf(':') >= 0 ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName); } /** * An FSM that lets us reclassify text tokens inside tags as attribute * names/values */ private static enum State { OUTSIDE_TAG, IN_TAG, SAW_NAME, SAW_EQ, ; } /** * Makes sure that this.token contains a token if one is available. * This may require fetching and combining multiple tokens from the underlying * splitter. 
*/ @Override protected HtmlToken produce() { HtmlToken token = readToken(); if (token == null) { return null; } switch (token.type) { // Keep track of whether we're inside a tag or not. case TAGBEGIN: state = State.IN_TAG; break; case TAGEND: if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) { // Distinguish from // pushbackToken(token); state = State.IN_TAG; return HtmlToken.instance( token.start, token.start, HtmlTokenType.ATTRVALUE); } state = State.OUTSIDE_TAG; break; // Drop ignorable tokens by zeroing out the one received and recursing case IGNORABLE: return produce(); // collapse adjacent text nodes if we're outside a tag, or otherwise, // Recognize attribute names and values. default: switch (state) { case OUTSIDE_TAG: if (HtmlTokenType.TEXT == token.type || HtmlTokenType.UNESCAPED == token.type) { token = collapseSubsequent(token); } break; case IN_TAG: if (HtmlTokenType.TEXT == token.type && !token.tokenInContextMatches(input, "=")) { // Reclassify as attribute name token = HtmlInputSplitter.reclassify( token, HtmlTokenType.ATTRNAME); state = State.SAW_NAME; } break; case SAW_NAME: if (HtmlTokenType.TEXT == token.type) { if (token.tokenInContextMatches(input, "=")) { state = State.SAW_EQ; // Skip the '=' token return produce(); } else { // Reclassify as attribute name token = HtmlInputSplitter.reclassify( token, HtmlTokenType.ATTRNAME); } } else { state = State.IN_TAG; } break; case SAW_EQ: if (HtmlTokenType.TEXT == token.type || HtmlTokenType.QSTRING == token.type) { if (HtmlTokenType.TEXT == token.type) { // Collapse adjacent text nodes to properly handle // // token = collapseAttributeName(token); } // Reclassify as value token = HtmlInputSplitter.reclassify( token, HtmlTokenType.ATTRVALUE); state = State.IN_TAG; } break; } break; } return token; } /** * Collapses all the following tokens of the same type into this.token. 
*/ private HtmlToken collapseSubsequent(HtmlToken token) { HtmlToken collapsed = token; for (HtmlToken next; (next= peekToken(0)) != null && next.type == token.type; readToken()) { collapsed = join(collapsed, next); } return collapsed; } private HtmlToken collapseAttributeName(HtmlToken token) { // We want to collapse tokens into the value that are not parts of an // attribute value. We should include any space or text adjacent to the // value, but should stop at any of the following constructions: // space end-of-file e.g. name=foo_ // space valueless-attrib-name e.g. name=foo checked // space tag-end e.g. name=foo /> // space text space? '=' e.g. name=foo bar= int nToMerge = 0; for (HtmlToken t; (t = peekToken(nToMerge)) != null;) { if (t.type == HtmlTokenType.IGNORABLE) { HtmlToken tok = peekToken(nToMerge + 1); if (tok == null) { break; } if (tok.type != HtmlTokenType.TEXT) { break; } if (isValuelessAttribute(input.substring(tok.start, tok.end))) { break; } HtmlToken eq = peekToken(nToMerge + 2); if (eq != null && eq.type == HtmlTokenType.IGNORABLE) { eq = peekToken(nToMerge + 3); } if (eq == null || eq.tokenInContextMatches(input, "=")) { break; } } else if (t.type != HtmlTokenType.TEXT) { break; } ++nToMerge; } if (nToMerge == 0) { return token; } int end = token.end; do { end = readToken().end; } while (--nToMerge > 0); return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT); } private static HtmlToken join(HtmlToken a, HtmlToken b) { return HtmlToken.instance(a.start, b.end, a.type); } private final LinkedList lookahead = Lists.newLinkedList(); private HtmlToken readToken() { if (!lookahead.isEmpty()) { return lookahead.remove(); } else if (splitter.hasNext()) { return splitter.next(); } else { return null; } } private HtmlToken peekToken(int i) { while (lookahead.size() <= i && splitter.hasNext()) { lookahead.add(splitter.next()); } return lookahead.size() > i ? 
lookahead.get(i) : null; } private void pushbackToken(HtmlToken token) { lookahead.addFirst(token); } /** Can the attribute appear in HTML without a value. */ private static boolean isValuelessAttribute(String attribName) { boolean valueless = VALUELESS_ATTRIB_NAMES.contains( Strings.toLowerCase(attribName)); return valueless; } // From http://issues.apache.org/jira/browse/XALANC-519 private static final Set VALUELESS_ATTRIB_NAMES = ImmutableSet.of( "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", "selected"); } /** * A token stream that breaks a character stream into * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE} * tokens. The matching of attribute names and values is done in a later step. */ final class HtmlInputSplitter extends AbstractTokenStream { /** The source of HTML character data. */ private final String input; /** An offset into input. */ private int offset; /** True iff the current character is inside a tag. */ private boolean inTag; /** * True if inside a script, xmp, listing, or similar tag whose content does * not follow the normal escaping rules. */ private boolean inEscapeExemptBlock; /** * Null or the name of the close tag required to end the current escape exempt * block. * Preformatted tags include <script>, <xmp>, etc. that may * contain unescaped HTML input. */ private String escapeExemptTagName = null; private HtmlTextEscapingMode textEscapingMode; public HtmlInputSplitter(String input) { this.input = input; } /** * Make sure that there is a token ready to yield in this.token. */ @Override protected HtmlToken produce() { HtmlToken token = parseToken(); if (null == token) { return null; } // Handle escape-exempt blocks. // The parse() method is only dimly aware of escape-excempt blocks, so // here we detect the beginning and ends of escape exempt blocks, and // reclassify as UNESCAPED, any tokens that appear in the middle. 
if (inEscapeExemptBlock) { if (token.type != HtmlTokenType.SERVERCODE) { // classify RCDATA as text since it can contain entities token = reclassify( token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA ? HtmlTokenType.TEXT : HtmlTokenType.UNESCAPED)); } } else { switch (token.type) { case TAGBEGIN: { String canonTagName = canonicalName( token.start + 1, token.end); if (HtmlTextEscapingMode.isTagFollowedByLiteralContent( canonTagName)) { this.escapeExemptTagName = canonTagName; this.textEscapingMode = HtmlTextEscapingMode.getModeForTag( canonTagName); } break; } case TAGEND: this.inEscapeExemptBlock = null != this.escapeExemptTagName; break; default: break; } } return token; } /** * States for a state machine for optimistically identifying tags and other * html/xml/phpish structures. */ private static enum State { TAGNAME, SLASH, BANG, BANG_DASH, COMMENT, COMMENT_DASH, COMMENT_DASH_DASH, DIRECTIVE, DONE, APP_DIRECTIVE, APP_DIRECTIVE_QMARK, SERVER_CODE, SERVER_CODE_PCT, // From HTML 5 section 8.1.2.6 // The text in CDATA and RCDATA elements must not contain any // occurrences of the string "), or U+002F SOLIDUS (/), unless // that string is part of an escaping text span. // An escaping text span is a span of text (in CDATA and RCDATA // elements) and character entity references (in RCDATA elements) // that starts with an escaping text span start that is not itself // in an escaping text span, and ends at the next escaping text // span end. // An escaping text span start is a part of text that consists of // the four character sequence "". // An escaping text span start may share its U+002D HYPHEN-MINUS characters // with its corresponding escaping text span end. UNESCAPED_LT_BANG, // This property is useful as it allows fetch to collapse and reclassify * ranges of tokens based on state that is easy to maintain there. * *

Later passes are responsible for throwing away useless tokens. */ private HtmlToken parseToken() { int start = offset; int limit = input.length(); if (start == limit) { return null; } int end = start + 1; HtmlTokenType type; char ch = input.charAt(start); if (inTag) { if ('>' == ch) { type = HtmlTokenType.TAGEND; inTag = false; } else if ('/' == ch) { if (end != limit && '>' == input.charAt(end)) { type = HtmlTokenType.TAGEND; inTag = false; ++end; } else { type = HtmlTokenType.TEXT; } } else if ('=' == ch) { type = HtmlTokenType.TEXT; } else if ('"' == ch || '\'' == ch) { type = HtmlTokenType.QSTRING; int delim = ch; for (; end < limit; ++end) { if (input.charAt(end) == delim) { ++end; break; } } } else if (!Character.isWhitespace(ch)) { type = HtmlTokenType.TEXT; for (; end < limit; ++end) { ch = input.charAt(end); // End a text chunk before /> if ((lastNonIgnorable == null || !lastNonIgnorable.tokenInContextMatches(input, "=")) && '/' == ch && end + 1 < limit && '>' == input.charAt(end + 1)) { break; } else if ('>' == ch || '=' == ch || Character.isWhitespace(ch)) { break; } else if ('"' == ch || '\'' == ch) { if (end + 1 < limit) { char ch2 = input.charAt(end + 1); if (ch2 >= 0 && Character.isWhitespace(ch2) || ch2 == '>' || ch2 == '/') { ++end; break; } } } } } else { // We skip whitespace tokens inside tag bodies. type = HtmlTokenType.IGNORABLE; while (end < limit && Character.isWhitespace(input.charAt(end))) { ++end; } } } else { if (ch == '<') { if (end == limit) { type = HtmlTokenType.TEXT; } else { ch = input.charAt(end); type = null; State state = null; switch (ch) { case '/': // close tag? 
state = State.SLASH; ++end; break; case '!': // Comment or declaration if (!this.inEscapeExemptBlock) { state = State.BANG; } else if (HtmlTextEscapingMode.allowsEscapingTextSpan( escapeExemptTagName)) { // Directives, and cdata suppressed in escape // exempt mode as they could obscure the close of the // escape exempty block, but comments are similar to escaping // text spans, and are significant in all CDATA and RCDATA // blocks except those inside

tags. // See "Escaping text spans" in section 8.1.2.6 of HTML5. // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions state = State.UNESCAPED_LT_BANG; } ++end; break; case '?': if (!this.inEscapeExemptBlock) { state = State.APP_DIRECTIVE; } ++end; break; case '%': state = State.SERVER_CODE; ++end; break; default: if (isIdentStart(ch) && !this.inEscapeExemptBlock) { state = State.TAGNAME; ++end; } else if ('<' == ch) { type = HtmlTokenType.TEXT; } else { ++end; } break; } if (null != state) { charloop: while (end < limit) { ch = input.charAt(end); switch (state) { case TAGNAME: if (Character.isWhitespace(ch) || '>' == ch || '/' == ch || '<' == ch) { // End processing of an escape exempt block when we see // a corresponding end tag. if (this.inEscapeExemptBlock && '/' == input.charAt(start + 1) && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT && canonicalName(start + 2, end) .equals(escapeExemptTagName)) { this.inEscapeExemptBlock = false; this.escapeExemptTagName = null; this.textEscapingMode = null; } type = HtmlTokenType.TAGBEGIN; // Don't process content as attributes if we're inside // an escape exempt block. inTag = !this.inEscapeExemptBlock; state = State.DONE; break charloop; } break; case SLASH: if (Character.isLetter(ch)) { state = State.TAGNAME; } else { if ('<' == ch) { type = HtmlTokenType.TEXT; } else { ++end; } break charloop; } break; case BANG: if ('-' == ch) { state = State.BANG_DASH; } else { state = State.DIRECTIVE; } break; case BANG_DASH: if ('-' == ch) { state = State.COMMENT; } else { state = State.DIRECTIVE; } break; case COMMENT: if ('-' == ch) { state = State.COMMENT_DASH; } break; case COMMENT_DASH: state = ('-' == ch) ? 
State.COMMENT_DASH_DASH : State.COMMENT_DASH; break; case COMMENT_DASH_DASH: if ('>' == ch) { state = State.DONE; type = HtmlTokenType.COMMENT; } else if ('-' == ch) { state = State.COMMENT_DASH_DASH; } else { state = State.COMMENT_DASH; } break; case DIRECTIVE: if ('>' == ch) { type = HtmlTokenType.DIRECTIVE; state = State.DONE; } break; case APP_DIRECTIVE: if ('?' == ch) { state = State.APP_DIRECTIVE_QMARK; } break; case APP_DIRECTIVE_QMARK: if ('>' == ch) { type = HtmlTokenType.DIRECTIVE; state = State.DONE; } else if ('?' != ch) { state = State.APP_DIRECTIVE; } break; case SERVER_CODE: if ('%' == ch) { state = State.SERVER_CODE_PCT; } break; case SERVER_CODE_PCT: if ('>' == ch) { type = HtmlTokenType.SERVERCODE; state = State.DONE; } else if ('%' != ch) { state = State.SERVER_CODE; } break; case UNESCAPED_LT_BANG: if ('-' == ch) { state = State.UNESCAPED_LT_BANG_DASH; } else { type = HtmlTokenType.TEXT; state = State.DONE; } break; case UNESCAPED_LT_BANG_DASH: if ('-' == ch) { // According to HTML 5 section 8.1.2.6 // An escaping text span start may share its // U+002D HYPHEN-MINUS characters with its // corresponding escaping text span end. 
state = State.ESCAPING_TEXT_SPAN_DASH_DASH; } else { type = HtmlTokenType.TEXT; state = State.DONE; } break; case ESCAPING_TEXT_SPAN: if ('-' == ch) { state = State.ESCAPING_TEXT_SPAN_DASH; } break; case ESCAPING_TEXT_SPAN_DASH: if ('-' == ch) { state = State.ESCAPING_TEXT_SPAN_DASH_DASH; } else { state = State.ESCAPING_TEXT_SPAN; } break; case ESCAPING_TEXT_SPAN_DASH_DASH: if ('>' == ch) { type = HtmlTokenType.TEXT; state = State.DONE; } else if ('-' != ch) { state = State.ESCAPING_TEXT_SPAN; } break; case DONE: throw new AssertionError( "Unexpectedly DONE while lexing HTML token stream"); } ++end; if (State.DONE == state) { break; } } if (end == limit) { switch (state) { case DONE: break; case COMMENT: case COMMENT_DASH: case COMMENT_DASH_DASH: type = HtmlTokenType.COMMENT; break; case DIRECTIVE: case APP_DIRECTIVE: case APP_DIRECTIVE_QMARK: type = HtmlTokenType.DIRECTIVE; break; case SERVER_CODE: case SERVER_CODE_PCT: type = HtmlTokenType.SERVERCODE; break; case TAGNAME: type = HtmlTokenType.TAGBEGIN; break; default: type = HtmlTokenType.TEXT; break; } } } } } else { type = null; } } if (null == type) { while (end < limit && '<' != input.charAt(end)) { ++end; } type = HtmlTokenType.TEXT; } offset = end; HtmlToken result = HtmlToken.instance(start, end, type); if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; } return result; } private String canonicalName(int start, int end) { return HtmlLexer.canonicalName(input.substring(start, end)); } private boolean isIdentStart(char ch) { return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); } static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) { return HtmlToken.instance(token.start, token.end, type); } } /** * A TokenStream that lazily fetches one token at a time. 
* * @author Mike Samuel <mikesamuel@gmail.com> */ abstract class AbstractTokenStream implements TokenStream { private HtmlToken tok; public final boolean hasNext() { if (tok == null) { tok = produce(); } return tok != null; } public HtmlToken next() { if (this.tok == null) { this.tok = produce(); } HtmlToken t = this.tok; if (t == null) { throw new NoSuchElementException(); } this.tok = null; return t; } protected abstract HtmlToken produce(); } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/TokenStream.java���������������������0000664�0001750�0001750�00000003137�11654053470�030113� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. 
// Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; interface TokenStream { HtmlToken next(); boolean hasNext(); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlToken.java�����������������������0000664�0001750�0001750�00000004157�11654053470�027567� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// 
Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import javax.annotation.concurrent.Immutable; @Immutable final class HtmlToken { final int start; final int end; final HtmlTokenType type; static HtmlToken instance(int start, int end, HtmlTokenType type) { return new HtmlToken(start, end, type); } boolean tokenInContextMatches(String context, String match) { int n = end - start; if (n != match.length()) { return false; } return context.regionMatches(start, match, 0, n); } private HtmlToken(int start, int end, HtmlTokenType type) { this.start = start; this.end = end; this.type = type; } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlTextEscapingMode.java������������0000664�0001750�0001750�00000014046�11654053470�031710� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import com.google.common.collect.ImmutableMap; /** * From section 8.1.2.6 of http://www.whatwg.org/specs/web-apps/current-work/ * <p> * The text in CDATA and RCDATA elements must not contain any * occurrences of the string "</" (U+003C LESS-THAN SIGN, U+002F * SOLIDUS) followed by characters that case-insensitively match the * tag name of the element followed by one of U+0009 CHARACTER * TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C * FORM FEED (FF), U+0020 SPACE, U+003E GREATER-THAN SIGN (>), or * U+002F SOLIDUS (/), unless that string is part of an escaping * text span. * </p> * * <p> * See also * http://www.whatwg.org/specs/web-apps/current-work/#cdata-rcdata-restrictions * for the elements which fall in each category. 
* </p> * * @author Mike Samuel <mikesamuel@gmail.com> */ enum HtmlTextEscapingMode { /** * Normally escaped character data that breaks around comments and tags. */ PCDATA, /** * A span of text where HTML special characters are interpreted literally, * as in a SCRIPT tag. */ CDATA, /** * Like {@link #CDATA} but only for certain browsers. */ CDATA_SOMETIMES, /** * A span of text and character entity references where HTML special * characters are interpreted literally, as in a TITLE tag. */ RCDATA, /** * A span of text where HTML special characters are interpreted literally, * where there is no end tag. PLAIN_TEXT runs until the end of the file. */ PLAIN_TEXT, /** * Cannot contain data. */ VOID, ; private static final ImmutableMap<String, HtmlTextEscapingMode> ESCAPING_MODES = ImmutableMap.<String, HtmlTextEscapingMode>builder() .put("iframe", CDATA) // HTML5 does not treat listing as CDATA and treats XMP as deprecated, // but HTML2 does at // http://www.w3.org/MarkUp/1995-archive/NonStandard.html // Listing is not supported by browsers. .put("listing", CDATA_SOMETIMES) .put("xmp", CDATA) // Technically, noembed, noscript and noframes are CDATA_SOMETIMES but // we can only be hurt by allowing tag content that looks like text so // we treat them as regular.. //.put("noembed", CDATA_SOMETIMES) //.put("noframes", CDATA_SOMETIMES) //.put("noscript", CDATA_SOMETIMES) .put("comment", CDATA_SOMETIMES) // IE only // Runs till end of file. .put("plaintext", PLAIN_TEXT) .put("script", CDATA) .put("style", CDATA) // Textarea and Title are RCDATA, not CDATA, so decode entity references. .put("textarea", RCDATA) .put("title", RCDATA) // Nodes that can't contain content. 
// http://www.w3.org/TR/html-markup/syntax.html#void-elements .put("area", VOID) .put("base", VOID) .put("br", VOID) .put("col", VOID) .put("command", VOID) .put("embed", VOID) .put("hr", VOID) .put("img", VOID) .put("input", VOID) .put("keygen", VOID) .put("link", VOID) .put("meta", VOID) .put("param", VOID) .put("source", VOID) .put("track", VOID) .put("wbr", VOID) .build(); /** * The mode used for content following a start tag with the given name. */ public static HtmlTextEscapingMode getModeForTag(String canonTagName) { HtmlTextEscapingMode mode = ESCAPING_MODES.get(canonTagName); return mode != null ? mode : PCDATA; } /** * True iff the content following the given tag allows escaping text * spans: {@code <!--&hellip;-->} that escape even things that might * be an end tag for the corresponding open tag. */ public static boolean allowsEscapingTextSpan(String canonTagName) { // <xmp> and <plaintext> do not admit escaping text spans. return "style".equals(canonTagName) || "script".equals(canonTagName) || "noembed".equals(canonTagName) || "noscript".equals(canonTagName) || "noframes".equals(canonTagName); } /** * True if content immediately following the start tag must be treated as * special CDATA so that &lt;'s are not treated as starting tags, comments * or directives. */ public static boolean isTagFollowedByLiteralContent(String canonTagName) { HtmlTextEscapingMode mode = getModeForTag(canonTagName); return mode != PCDATA && mode != VOID; } /** * True iff the tag cannot contain any content -- will an HTML parser consider * the element to have ended immediately after the start tag. 
*/ public static boolean isVoidElement(String canonTagName) { return getModeForTag(canonTagName) == VOID; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000150�00000000000�011561� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/TagBalancingHtmlStreamEventReceiver.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/TagBalancingHtmlStreamEventReceiver.j0000664�0001750�0001750�00000066245�11654053470�034202� 0����������������������������������������������������������������������������������������������������ustar 
�jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import java.util.List; import javax.annotation.concurrent.Immutable; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; /** * Wraps an HTML stream event receiver to fill in missing close tags. * If the balancer is given the HTML {@code <p>1<p>2}, the wrapped receiver will * see events equivalent to {@code <p>1</p><p>2</p>}. * * @author Mike Samuel <mikesamuel@gmail.com> */ @TCB public class TagBalancingHtmlStreamEventReceiver implements HtmlStreamEventReceiver { private final HtmlStreamEventReceiver underlying; private int nestingLimit = Integer.MAX_VALUE; private final List<ElementContainmentInfo> openElements = Lists.newArrayList(); public TagBalancingHtmlStreamEventReceiver( HtmlStreamEventReceiver underlying) { this.underlying = underlying; } public void setNestingLimit(int limit) { if (openElements.size() > limit) { throw new IllegalStateException(); } this.nestingLimit = limit; } public void openDocument() { underlying.openDocument(); } public void closeDocument() { for (int i = Math.min(nestingLimit, openElements.size()); --i >= 0;) { underlying.closeTag(openElements.get(i).elementName); } openElements.clear(); underlying.closeDocument(); } public void openTag(String elementName, List<String> attrs) { String canonElementName = HtmlLexer.canonicalName(elementName); ElementContainmentInfo elInfo = ELEMENT_CONTAINMENT_RELATIONSHIPS.get( canonElementName); // Treat unrecognized tags as void, but emit closing tags in closeTag(). if (elInfo == null) { if (openElements.size() < nestingLimit) { underlying.openTag(elementName, attrs); } return; } // Close all the elements that cannot contain the element to open. 
List<ElementContainmentInfo> toResumeInReverse = null; for (int i = openElements.size(); --i >= 0;) { ElementContainmentInfo top = openElements.get(i); if ((top.contents & elInfo.types) != 0) { break; } if (openElements.size() < nestingLimit) { underlying.closeTag(top.elementName); } openElements.remove(i); if (top.resumable) { if (toResumeInReverse == null) { toResumeInReverse = Lists.newArrayList(); } toResumeInReverse.add(top); } } if (toResumeInReverse != null) { resume(toResumeInReverse); } if (openElements.size() < nestingLimit) { underlying.openTag(elementName, attrs); } if (!elInfo.isVoid) { openElements.add(elInfo); } } public void closeTag(String elementName) { String canonElementName = HtmlLexer.canonicalName(elementName); ElementContainmentInfo elInfo = ELEMENT_CONTAINMENT_RELATIONSHIPS.get( canonElementName); if (elInfo == null) { // Allow unrecognized end tags through. if (openElements.size() < nestingLimit) { underlying.closeTag(elementName); } return; } int index = openElements.lastIndexOf(elInfo); if (index < 0) { return; } // Don't close unopened tags. int last = openElements.size(); // Close all the elements that cannot contain the element to open. 
List<ElementContainmentInfo> toResumeInReverse = null; while (--last > index) { ElementContainmentInfo unclosed = openElements.remove(last); if (last + 1 < nestingLimit) { underlying.closeTag(unclosed.elementName); } if (unclosed.resumable) { if (toResumeInReverse == null) { toResumeInReverse = Lists.newArrayList(); } toResumeInReverse.add(unclosed); } } if (openElements.size() < nestingLimit) { underlying.closeTag(elementName); } openElements.remove(index); if (toResumeInReverse != null) { resume(toResumeInReverse); } } private void resume(List<ElementContainmentInfo> toResumeInReverse) { for (ElementContainmentInfo toResume : toResumeInReverse) { // TODO: If resuming of things other than plain formatting tags like <b> // and <i>, then we need to store the attributes for resumable tags so // that we can resume with the appropriate attributes. if (openElements.size() < nestingLimit) { underlying.openTag(toResume.elementName, Lists.<String>newArrayList()); } openElements.add(toResume); } } public void text(String text) { if (openElements.size() < nestingLimit) { underlying.text(text); } } @Immutable static final class ElementContainmentInfo { final String elementName; /** * True if the adoption agency algorithm allows an element to be resumed * after a mis-nested end tag closes it. * E.g. in {@code <b>Foo<i>Bar</b>Baz</i>} the {@code <i>} element is * resumed after the {@code <b>} element is closed. */ final boolean resumable; /** A set of bits of element groups into which the element falls. */ final int types; /** The type of elements that an element can contain. */ final int contents; /** True if the element has no content -- not even text content. 
*/ final boolean isVoid; ElementContainmentInfo( String elementName, boolean resumable, int types, int contents) { this.elementName = elementName; this.resumable = resumable; this.types = types; this.contents = contents; this.isVoid = contents == 0 && HtmlTextEscapingMode.isVoidElement(elementName); } @Override public String toString() { return "<" + elementName + ">"; } } ImmutableMap<String, ElementContainmentInfo> ELEMENT_CONTAINMENT_RELATIONSHIPS = new ElementContainmentRelationships().toMap(); private static class ElementContainmentRelationships { private enum ElementGroup { BLOCK, INLINE, INLINE_MINUS_A, MIXED, TABLE_CONTENT, HEAD_CONTENT, TOP_CONTENT, AREA_ELEMENT, FORM_ELEMENT, LEGEND_ELEMENT, LI_ELEMENT, DL_PART, P_ELEMENT, OPTIONS_ELEMENT, OPTION_ELEMENT, PARAM_ELEMENT, TABLE_ELEMENT, TR_ELEMENT, TD_ELEMENT, COL_ELEMENT, ; } private static int elementGroupBits(ElementGroup a) { return 1 << a.ordinal(); } private static int elementGroupBits( ElementGroup a, ElementGroup b) { return (1 << a.ordinal()) | (1 << b.ordinal()); } private static int elementGroupBits( ElementGroup a, ElementGroup b, ElementGroup c) { return (1 << a.ordinal()) | (1 << b.ordinal()) | (1 << c.ordinal()); } private static int elementGroupBits( ElementGroup... 
bits) { int bitField = 0; for (ElementGroup bit : bits) { bitField |= (1 << bit.ordinal()); } return bitField; } private ImmutableMap.Builder<String, ElementContainmentInfo> definitions = ImmutableMap.builder(); private void defineElement( String elementName, boolean resumable, int types, int contentTypes) { definitions.put(elementName, new ElementContainmentInfo( elementName, resumable, types, contentTypes)); } private ImmutableMap<String, ElementContainmentInfo> toMap() { return definitions.build(); } { defineElement( "a", false, elementGroupBits( ElementGroup.INLINE ), elementGroupBits( ElementGroup.INLINE_MINUS_A )); defineElement( "abbr", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "acronym", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "address", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE, ElementGroup.P_ELEMENT )); defineElement( "applet", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.PARAM_ELEMENT )); defineElement( "area", false, elementGroupBits(ElementGroup.AREA_ELEMENT), 0); defineElement( "audio", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "b", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "base", false, elementGroupBits(ElementGroup.HEAD_CONTENT), 0); defineElement( "basefont", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "bdi", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "bdo", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 
elementGroupBits( ElementGroup.INLINE )); defineElement( "big", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "blink", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "blockquote", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "body", false, elementGroupBits( ElementGroup.TOP_CONTENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "br", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "button", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "canvas", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "caption", false, elementGroupBits( ElementGroup.TABLE_CONTENT ), elementGroupBits( ElementGroup.INLINE )); defineElement( "center", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "cite", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "code", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "col", false, elementGroupBits( ElementGroup.TABLE_CONTENT, ElementGroup.COL_ELEMENT ), 0); defineElement( "colgroup", false, elementGroupBits( ElementGroup.TABLE_CONTENT ), elementGroupBits( ElementGroup.COL_ELEMENT )); defineElement( "dd", false, elementGroupBits( ElementGroup.DL_PART ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "del", true, elementGroupBits( ElementGroup.BLOCK, 
ElementGroup.INLINE, ElementGroup.MIXED ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "dfn", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "dir", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.LI_ELEMENT )); defineElement( "div", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "dl", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.DL_PART )); defineElement( "dt", false, elementGroupBits( ElementGroup.DL_PART ), elementGroupBits( ElementGroup.INLINE )); defineElement( "em", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "fieldset", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.LEGEND_ELEMENT )); defineElement( "font", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "form", false, elementGroupBits( ElementGroup.BLOCK, ElementGroup.FORM_ELEMENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A, ElementGroup.TR_ELEMENT, ElementGroup.TD_ELEMENT )); defineElement( "h1", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "h2", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "h3", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "h4", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "h5", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "h6", false, elementGroupBits( 
ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "head", false, elementGroupBits( ElementGroup.TOP_CONTENT ), elementGroupBits( ElementGroup.HEAD_CONTENT )); defineElement( "hr", false, elementGroupBits(ElementGroup.BLOCK), 0); defineElement( "html", false, 0, elementGroupBits(ElementGroup.TOP_CONTENT)); defineElement( "i", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "iframe", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "img", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "input", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "ins", true, elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "isindex", false, elementGroupBits(ElementGroup.INLINE), 0); defineElement( "kbd", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "label", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "legend", false, elementGroupBits( ElementGroup.LEGEND_ELEMENT ), elementGroupBits( ElementGroup.INLINE )); defineElement( "li", false, elementGroupBits( ElementGroup.LI_ELEMENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "link", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.HEAD_CONTENT ), 0); defineElement( "listing", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "map", false, elementGroupBits( ElementGroup.INLINE ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.AREA_ELEMENT )); defineElement( "meta", false, 
elementGroupBits(ElementGroup.HEAD_CONTENT), 0); defineElement( "nobr", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "noframes", false, elementGroupBits( ElementGroup.BLOCK, ElementGroup.TOP_CONTENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.TOP_CONTENT )); defineElement( "noscript", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "object", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A, ElementGroup.HEAD_CONTENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.PARAM_ELEMENT )); defineElement( "ol", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.LI_ELEMENT )); defineElement( "optgroup", false, elementGroupBits( ElementGroup.OPTIONS_ELEMENT ), elementGroupBits( ElementGroup.OPTIONS_ELEMENT )); defineElement( "option", false, elementGroupBits( ElementGroup.OPTIONS_ELEMENT, ElementGroup.OPTION_ELEMENT ), 0); defineElement( "p", false, elementGroupBits( ElementGroup.BLOCK, ElementGroup.P_ELEMENT ), elementGroupBits( ElementGroup.INLINE, ElementGroup.TABLE_ELEMENT )); defineElement( "param", false, elementGroupBits(ElementGroup.PARAM_ELEMENT), 0); defineElement( "pre", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); defineElement( "q", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "s", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "samp", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "script", false, elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A, 
ElementGroup.MIXED, ElementGroup.TABLE_CONTENT, ElementGroup.HEAD_CONTENT, ElementGroup.TOP_CONTENT, ElementGroup.AREA_ELEMENT, ElementGroup.FORM_ELEMENT, ElementGroup.LEGEND_ELEMENT, ElementGroup.LI_ELEMENT, ElementGroup.DL_PART, ElementGroup.P_ELEMENT, ElementGroup.OPTIONS_ELEMENT, ElementGroup.OPTION_ELEMENT, ElementGroup.PARAM_ELEMENT, ElementGroup.TABLE_ELEMENT, ElementGroup.TR_ELEMENT, ElementGroup.TD_ELEMENT, ElementGroup.COL_ELEMENT ), 0); defineElement( "select", false, elementGroupBits( ElementGroup.INLINE ), elementGroupBits( ElementGroup.OPTIONS_ELEMENT )); defineElement( "small", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "span", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "strike", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "strong", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "style", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.HEAD_CONTENT ), 0); defineElement( "sub", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "sup", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "table", false, elementGroupBits( ElementGroup.BLOCK, ElementGroup.TABLE_ELEMENT ), elementGroupBits( ElementGroup.TABLE_CONTENT, ElementGroup.FORM_ELEMENT )); defineElement( "tbody", false, elementGroupBits( ElementGroup.TABLE_CONTENT ), elementGroupBits( ElementGroup.TR_ELEMENT )); defineElement( "td", false, elementGroupBits( ElementGroup.TD_ELEMENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "textarea", false, 
// No, a textarea cannot be inside a link. elementGroupBits(ElementGroup.INLINE), 0); defineElement( "tfoot", false, elementGroupBits( ElementGroup.TABLE_CONTENT ), elementGroupBits( ElementGroup.FORM_ELEMENT, ElementGroup.TR_ELEMENT, ElementGroup.TD_ELEMENT )); defineElement( "th", false, elementGroupBits( ElementGroup.TD_ELEMENT ), elementGroupBits( ElementGroup.BLOCK, ElementGroup.INLINE )); defineElement( "thead", false, elementGroupBits( ElementGroup.TABLE_CONTENT ), elementGroupBits( ElementGroup.FORM_ELEMENT, ElementGroup.TR_ELEMENT, ElementGroup.TD_ELEMENT )); defineElement( "title", false, elementGroupBits(ElementGroup.HEAD_CONTENT), 0); defineElement( "tr", false, elementGroupBits( ElementGroup.TABLE_CONTENT, ElementGroup.TR_ELEMENT ), elementGroupBits( ElementGroup.FORM_ELEMENT, ElementGroup.TD_ELEMENT )); defineElement( "tt", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "u", true, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "ul", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.LI_ELEMENT )); defineElement( "var", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), elementGroupBits( ElementGroup.INLINE )); defineElement( "video", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "wbr", false, elementGroupBits( ElementGroup.INLINE, ElementGroup.INLINE_MINUS_A ), 0); defineElement( "xmp", false, elementGroupBits( ElementGroup.BLOCK ), elementGroupBits( ElementGroup.INLINE )); } } } 
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlTokenType.java�������������������0000664�0001750�0001750�00000005346�11654053470�030432� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

/**
 * The kinds of token that can appear in a stream of HTML tokens.
 *
 * @author Mike Samuel <mikesamuel@gmail.com>
 */
enum HtmlTokenType {

  /**
   * The name of an HTML or XML attribute: a run of characters that are
   * neither whitespace, nor {@code =}, nor specials.
   */
  ATTRNAME,

  /** The value of an HTML attribute; it may be a quoted string. */
  ATTRVALUE,

  /** A comment in HTML or XML style, <tt>&lt;!-- for example --></tt>. */
  COMMENT,

  /** A directive, e.g. a DOCTYPE declaration. */
  DIRECTIVE,

  /** Unescaped tag, for instance, inside a script, or {@code xmp} tag. */
  UNESCAPED,

  /**
   * A quoted string.  Well formed HTML should not contain one, but it can
   * appear where an attribute value has no corresponding attribute name.
   */
  QSTRING,

  /**
   * The start of a tag, which is not the same thing as a start tag:
   * both <tt>&lt;a</tt> and <tt>&lt;/a</tt> are valid tag beginnings.
   * What follows in the tag is a series of attribute names and values, and
   * finally the tag end.
   */
  TAGBEGIN,

  /** The close of a tag: either <tt>&gt;</tt> or <tt>/&gt;</tt>. */
  TAGEND,

  /** A run of text, whether inside a tag or as the content of an element. */
  TEXT,

  /** Whitespace nodes that may be ignored. */
  IGNORABLE,

  /** A block of server side script, in the manner of php or jsp. */
  SERVERCODE;
}
������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlStreamRenderer.java��������������0000664�0001750�0001750�00000042116�11654053470�031426� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import com.google.common.annotations.VisibleForTesting; import java.io.Closeable; import java.io.Flushable; import java.io.IOException; import java.util.Iterator; import java.util.List; import javax.annotation.WillCloseWhenClosed; import javax.annotation.concurrent.NotThreadSafe; /** * Given a series of HTML tokens, writes valid, normalized HTML to the output. * The output will have well-defined tag boundaries, but there may be orphaned * or missing close and open tags. * The result of two renderers can always be concatenated to produce a larger * snippet of HTML, but if the first was called with * {@code writeOpenTag("plaintext", ...)}, then any tags in the second will not * be interpreted as tags in the concatenated version. */ @TCB @NotThreadSafe public class HtmlStreamRenderer implements HtmlStreamEventReceiver { private final Appendable output; private final Handler<? super IOException> ioExHandler; private final Handler<? super String> badHtmlHandler; private String lastTagOpened; private StringBuilder pendingUnescaped; private boolean open; /** * Factory. * @param output the buffer to which HTML is streamed. * @param ioExHandler called with any exception raised by output. * @param badHtmlHandler receives alerts when HTML cannot be rendered because * there is not valid HTML tree that results from that series of calls. * E.g. 
it is not possible to create an HTML {@code <style>} element whose * textual content is {@code "</style>"}. */ public static HtmlStreamRenderer create( @WillCloseWhenClosed Appendable output, Handler<? super IOException> ioExHandler, Handler<? super String> badHtmlHandler) { if (output instanceof Closeable) { return new CloseableHtmlStreamRenderer( output, ioExHandler, badHtmlHandler); } else { return new HtmlStreamRenderer(output, ioExHandler, badHtmlHandler); } } /** * Factory. * @param output the buffer to which HTML is streamed. * @param badHtmlHandler receives alerts when HTML cannot be rendered because * there is not valid HTML tree that results from that series of calls. * E.g. it is not possible to create an HTML {@code <style>} element whose * textual content is {@code "</style>"}. */ public static HtmlStreamRenderer create( StringBuilder output, Handler<? super String> badHtmlHandler) { // Propagate since StringBuilder should not throw IOExceptions. return create(output, Handler.PROPAGATE, badHtmlHandler); } private HtmlStreamRenderer( Appendable output, Handler<? super IOException> ioExHandler, Handler<? super String> badHtmlHandler) { this.output = output; this.ioExHandler = ioExHandler; this.badHtmlHandler = badHtmlHandler; } /** * Called when the series of calls make no sense. * May be overridden to throw an unchecked throwable, to log, or to take some * other action. * * @param message for human consumption. * @param identifier an HTML identifier associated with the message. */ private final void error(String message, CharSequence identifier) { if (badHtmlHandler != Handler.DO_NOTHING) { // Avoid string append. 
badHtmlHandler.handle(message + " : " + identifier); } } public final void openDocument() throws IllegalStateException { if (open) { throw new IllegalStateException(); } open = true; } public final void closeDocument() throws IllegalStateException { if (!open) { throw new IllegalStateException(); } if (pendingUnescaped != null) { closeTag(lastTagOpened); } open = false; if (output instanceof Flushable) { try { ((Flushable) output).flush(); } catch (IOException ex) { ioExHandler.handle(ex); } } } public final boolean isDocumentOpen() { return open; } public final void openTag(String elementName, List<String> attrs) { try { writeOpenTag(elementName, attrs); } catch (IOException ex) { ioExHandler.handle(ex); } } private void writeOpenTag(String elementName, List<? extends String> attrs) throws IOException { if (!open) { throw new IllegalStateException(); } elementName = safeName(elementName); if (!isValidHtmlName(elementName)) { error("Invalid element name", elementName); return; } if (pendingUnescaped != null) { error("Tag content cannot appear inside CDATA element", elementName); return; } switch (HtmlTextEscapingMode.getModeForTag(elementName)) { case CDATA_SOMETIMES: case CDATA: case PLAIN_TEXT: lastTagOpened = elementName; pendingUnescaped = new StringBuilder(); break; default: } output.append('<').append(elementName); for (Iterator<? extends String> attrIt = attrs.iterator(); attrIt.hasNext();) { String name = attrIt.next(); String value = attrIt.next(); name = HtmlLexer.canonicalName(name); if (!isValidHtmlName(name)) { error("Invalid attr name", name); continue; } output.append(' ').append(name).append('=').append('"'); escapeHtmlOnto(value, output); if (value.indexOf('`') != -1) { // Apparently, in quirks mode, IE8 does a poor job producing innerHTML // values. 
Given // <div attr="``foo=bar"> // we encode &#96; but if JavaScript does: // nodeA.innerHTML = nodeB.innerHTML; // and nodeB contains the DIV above, then IE8 will produce // <div attr=``foo=bar> // as the value of nodeB.innerHTML and assign it to nodeA. // IE8's HTML parser treats `` as a blank attribute value and foo=bar // becomes a separate attribute. // Adding a space at the end of the attribute prevents this by forcing // IE8 to put double quotes around the attribute when computing // nodeB.innerHTML. output.append(' '); } output.append('"'); } output.append('>'); } public final void closeTag(String elementName) { try { writeCloseTag(safeName(elementName)); } catch (IOException ex) { ioExHandler.handle(ex); } } private final void writeCloseTag(String elementName) throws IOException { if (!open) { throw new IllegalStateException(); } elementName = HtmlLexer.canonicalName(elementName); if (!isValidHtmlName(elementName)) { error("Invalid element name", elementName); return; } if (pendingUnescaped != null) { if (!lastTagOpened.equals(elementName)) { error("Tag content cannot appear inside CDATA element", elementName); return; } else { StringBuilder cdataContent = pendingUnescaped; pendingUnescaped = null; int problemIndex = checkHtmlCdataCloseable(lastTagOpened, cdataContent); if (problemIndex == -1) { output.append(cdataContent); } else { error( "Invalid CDATA text content", cdataContent.subSequence( problemIndex, Math.min(problemIndex + 10, cdataContent.length()))); // Still output the close tag. 
} } if ("plaintext".equals(elementName)) { return; } } output.append("</").append(elementName).append(">"); } public final void text(String text) { try { writeText(text); } catch (IOException ex) { ioExHandler.handle(ex); } } private final void writeText(String text) throws IOException { if (!open) { throw new IllegalStateException(); } if (pendingUnescaped != null) { pendingUnescaped.append(text.replaceAll("\0", "")); } else { escapeHtmlOnto(text, output); // Works for RCDATA. } } private static int checkHtmlCdataCloseable( String localName, StringBuilder sb) { int escapingTextSpanStart = -1; for (int i = 0, n = sb.length(); i < n; ++i) { char ch = sb.charAt(i); switch (ch) { case '<': if (i + 3 < n && '!' == sb.charAt(i + 1) && '-' == sb.charAt(i + 2) && '-' == sb.charAt(i + 3)) { if (escapingTextSpanStart == -1) { escapingTextSpanStart = i; } else { return i; } } else if (i + 1 + localName.length() < n && '/' == sb.charAt(i + 1) && Strings.regionMatchesIgnoreCase( sb, i + 2, localName, 0, localName.length())) { // A close tag contained in the content. if (escapingTextSpanStart < 0) { // We could try some recovery strategies here. // E.g. prepending "/<!--\n" to sb if "script".equals(localName) return i; } if (!"script".equals(localName)) { // Script tags are commonly included inside script tags. // <script><!--document.write('<script>f()</script>');--></script> // but this does not happen in other CDATA element types. // Actually allowing an end tag inside others is problematic. // Specifically, // <style><!--</style>-->/* foo */</style> // displays the text "/* foo */" on some browsers. return i; } } break; case '>': // From the HTML5 spec: // The text in style, script, title, and textarea elements must not // have an escaping text span start that is not followed by an // escaping text span end. // We look left since the HTML 5 spec allows the escaping text span // end to share dashes with the start. 
if (i >= 2 && '-' == sb.charAt(i - 1) && '-' == sb.charAt(i - 2)) { if (escapingTextSpanStart < 0) { return i - 2; } escapingTextSpanStart = -1; } break; } } if (escapingTextSpanStart >= 0) { // We could try recovery strategies here. // E.g. appending "//-->" to the buffer if "script".equals(localName) return escapingTextSpanStart; } return -1; } @VisibleForTesting static boolean isValidHtmlName(String name) { int n = name.length(); if (n == 0) { return false; } if (n > 128) { return false; } boolean isNamespaced = false; for (int i = 0; i < n; ++i) { char ch = name.charAt(i); switch (ch) { case ':': if (isNamespaced) { return false; } isNamespaced = true; if (i == 0 || i + 1 == n) { return false; } break; case '-': if (i == 0 || i + 1 == n) { return false; } break; default: if (ch <= '9') { if (i == 0 || ch < '0') { return false; } } else if ('A' <= ch && ch <= 'z') { if ('Z' < ch && ch < 'a') { return false; } } else { return false; } break; } } return true; } /** * Writes the HTML equivalent of the given plain text to output. * For example, {@code escapeHtmlOnto("1 < 2", w)}, * is equivalent to {@code w.append("1 &lt; 2")} but possibly with fewer * smaller appends. */ static void escapeHtmlOnto(String plainText, Appendable output) throws IOException { int n = plainText.length(); int pos = 0; for (int i = 0; i < n; ++i) { char ch = plainText.charAt(i); if (ch < REPLACEMENTS.length) { String repl = REPLACEMENTS[ch]; if (repl != null) { output.append(plainText, pos, i).append(repl); pos = i + 1; } } else if (Character.isHighSurrogate(ch) && i + 1 < n) { // Emit supplemental codepoints as entity so that they cannot // be mis-encoded as UTF-8 of surrogates instead of UTF-8 proper // and get involved in UTF-16/UCS-2 confusion. 
char next = plainText.charAt(i + 1); if (Character.isLowSurrogate(next)) { int codepoint = Character.toCodePoint(ch, next); output.append(plainText, pos, i); appendNumericEntity(codepoint, output); ++i; pos = i + 1; } // All orphaned surrogates are rendered raw. } else if (ch >= 0xff00) { // Is a control character or possible full-width version of a // special character. output.append(plainText, pos, i); appendNumericEntity(ch, output); pos = i + 1; } } output.append(plainText, pos, n); } /** Maps ASCII chars that need to be encoded to an equivalent HTML entity. */ static final String[] REPLACEMENTS = new String[0x61]; static { REPLACEMENTS[0] = ""; // Elide. for (int i = 1; i < ' '; ++i) { if (i == '\n' || i == '\r') { continue; } REPLACEMENTS[i] = "&#" + i + ";"; } REPLACEMENTS['"'] = "&#" + ((int) '"') + ";"; // Attribute delimiter. REPLACEMENTS['&'] = "&amp;"; // HTML special. REPLACEMENTS['\''] = "&#" + ((int) '\'') + ";";// Attribute delimiter. REPLACEMENTS['+'] = "&#" + ((int) '+') + ";"; // UTF-7 special. REPLACEMENTS['<'] = "&lt;"; // HTML special. REPLACEMENTS['='] = "&#" + ((int) '=') + ";"; // Special in attributes. REPLACEMENTS['>'] = "&gt;"; // HTML special. REPLACEMENTS['@'] = "&#" + ((int) '@') + ";"; // Conditional compilation. REPLACEMENTS['`'] = "&#" + ((int) '`') + ";"; // Attribute delimiter. } static void appendNumericEntity(int codepoint, Appendable output) throws IOException { if (codepoint < 100) { // TODO: is this dead code due to REPLACEMENTS above. output.append("&#"); if (codepoint < 10) { output.append((char) ('0' + codepoint)); } else { output.append((char) ('0' + (codepoint / 10))); output.append((char) ('0' + (codepoint % 10))); } output.append(";"); } else { int nDigits = (codepoint < 0x1000 ? codepoint < 0x100 ? 2 : 3 : (codepoint < 0x10000 ? 4 : codepoint < 0x100000 ? 
5 : 6)); output.append("&#x"); for (int digit = nDigits; --digit >= 0;) { int hexDigit = (codepoint >>> (digit << 2)) & 0xf; output.append(HEX_NUMERAL[hexDigit]); } output.append(";"); } } private static final char[] HEX_NUMERAL = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', }; /** * Canonicalizes the element name and possibly substitutes an alternative * that has more consistent semantics. */ static String safeName(String elementName) { elementName = HtmlLexer.canonicalName(elementName); // Substitute a reliably non-raw-text element for raw-text and // plain-text elements. switch (elementName.length()) { case 3: if ("xmp".equals(elementName)) { return "pre"; } break; case 7: if ("listing".equals(elementName)) { return "pre"; } break; case 9: if ("plaintext".equals(elementName)) { return "pre"; } break; } return elementName; } static class CloseableHtmlStreamRenderer extends HtmlStreamRenderer implements Closeable { private final Closeable closeable; CloseableHtmlStreamRenderer( @WillCloseWhenClosed Appendable output, Handler<? super IOException> errorHandler, Handler<? 
super String> badHtmlHandler) { super(output, errorHandler, badHtmlHandler); this.closeable = (Closeable) output; } public void close() throws IOException { if (isDocumentOpen()) { closeDocument(); } closeable.close(); } } } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/TCB.java�����������������������������0000664�0001750�0001750�00000003660�11654053470�026270� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.lang.annotation.ElementType; import java.lang.annotation.Target; /** * Indicates that a program element is in the trusted computing base -- * there exists a security property that could be violated if this code is not * correct. */ @Target({ ElementType.CONSTRUCTOR, ElementType.METHOD, ElementType.PACKAGE, ElementType.TYPE }) public @interface TCB { // No members. } ��������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/Trie.java����������������������������0000664�0001750�0001750�00000013763�11654053470�026570� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.TreeMap; /** * A trie used to separate punctuation tokens in a run of non-whitespace * characters by preferring the longest punctuation string possible in a * greedy left-to-right scan. 
/**
 * An immutable trie over UTF-16 code units that maps a fixed set of strings
 * to integer values.  It is used to separate punctuation tokens in a run of
 * non-whitespace characters by preferring the longest punctuation string
 * possible in a greedy left-to-right scan.
 * <p>
 * Each node stores its outgoing edges as a sorted {@code char[]} with a
 * parallel array of child nodes, so a single step is a binary search.
 *
 * @author Mike Samuel <mikesamuel@gmail.com>
 */
final class Trie {
  private static final char[] ZERO_CHARS = new char[0];
  private static final Trie[] ZERO_TRIES = new Trie[0];

  /** Edge labels in ascending order; {@code children[i]} follows {@code childMap[i]}. */
  private final char[] childMap;
  /** Child nodes, parallel to {@link #childMap}. */
  private final Trie[] children;
  /** True iff the path from the root to this node spells a key of the input map. */
  private final boolean terminal;
  /** The value mapped to this node's key, or {@link Integer#MAX_VALUE} if not terminal. */
  private final int value;

  /**
   * Builds a trie containing every key of the given map.
   *
   * @param elements non null.  May be empty, in which case the resulting trie
   *     is a single non-terminal node without children.  Not modified.
   */
  public Trie(Map<String, Integer> elements) {
    this(sortedUniqEntries(elements), 0);
  }

  private Trie(List<Map.Entry<String, Integer>> elements, int depth) {
    this(elements, depth, 0, elements.size());
  }

  /**
   * @param elements non null, sorted by key.  Not modified.
   * @param depth the depth in the tree, i.e. the number of characters of each
   *     key in the range that have already been consumed.
   * @param start an index into elements of the first entry in this subtree.
   * @param end an index into elements past the last entry in this subtree.
   */
  private Trie(
      List<Map.Entry<String, Integer>> elements, int depth,
      int start, int end) {
    if (start >= end) {
      // Empty range.  Only reachable from the public constructor when the
      // input map is empty; previously this fell through to
      // elements.get(start) and failed with an IndexOutOfBoundsException.
      this.terminal = false;
      this.value = Integer.MAX_VALUE;
      this.childMap = ZERO_CHARS;
      this.children = ZERO_TRIES;
      return;
    }

    // Keys are sorted, so a key that ends exactly at this depth (a prefix of
    // all the others in the range) can only be the first one.
    this.terminal = depth == elements.get(start).getKey().length();
    if (this.terminal) {
      this.value = elements.get(start).getValue();
      if (start + 1 == end) {  // base case: a leaf
        this.childMap = ZERO_CHARS;
        this.children = ZERO_TRIES;
        return;
      }
      // The remaining entries are strictly longer and go to the children.
      ++start;
    } else {
      this.value = Integer.MAX_VALUE;
    }

    // First pass: count the distinct characters at this depth so that the
    // arrays can be allocated at their exact size.
    int childCount = 0;
    int last = -1;  // No char equals -1, so the first entry always counts.
    for (int i = start; i < end; ++i) {
      char ch = elements.get(i).getKey().charAt(depth);
      if (ch != last) {
        ++childCount;
        last = ch;
      }
    }
    this.childMap = new char[childCount];
    this.children = new Trie[childCount];

    // Second pass: every maximal run of entries sharing the same character at
    // this depth becomes one child subtree.
    int childStart = start;
    int childIndex = 0;
    char lastCh = elements.get(start).getKey().charAt(depth);
    for (int i = start + 1; i < end; ++i) {
      char ch = elements.get(i).getKey().charAt(depth);
      if (ch != lastCh) {
        childMap[childIndex] = lastCh;
        children[childIndex++] = new Trie(elements, depth + 1, childStart, i);
        childStart = i;
        lastCh = ch;
      }
    }
    childMap[childIndex] = lastCh;
    children[childIndex] = new Trie(elements, depth + 1, childStart, end);
  }

  /** Does this node correspond to a complete string in the input set. */
  public boolean isTerminal() { return terminal; }

  /**
   * The value associated with the string for this node, or
   * {@link Integer#MAX_VALUE} if this node is not terminal.
   */
  public int getValue() { return value; }

  /**
   * The child corresponding to the given character.
   * @return null if no such trie.
   */
  public Trie lookup(char ch) {
    int i = Arrays.binarySearch(childMap, ch);
    return i >= 0 ? children[i] : null;
  }

  /**
   * The descendant of this trie corresponding to the string for this trie
   * appended with s.
   * @param s non null.
   * @return null if no such trie.  This node itself if s is empty.
   */
  public Trie lookup(CharSequence s) {
    Trie t = this;
    for (int i = 0, n = s.length(); i < n; ++i) {
      t = t.lookup(s.charAt(i));
      if (null == t) { break; }
    }
    return t;
  }

  /** True iff this node has a child for the given character. */
  public boolean contains(char ch) {
    return Arrays.binarySearch(childMap, ch) >= 0;
  }

  /** The entries of m ordered by key, as produced by a {@link TreeMap}. */
  private static <T> List<Map.Entry<String, T>> sortedUniqEntries(
      Map<String, T> m) {
    return new ArrayList<Map.Entry<String, T>>(
        new TreeMap<String, T>(m).entrySet());
  }

  /**
   * Append all strings s such that {@code this.lookup(s).isTerminal()} to the
   * given list in lexical order.
   */
  public void toStringList(List<String> strings) {
    toStringList("", strings);
  }

  private void toStringList(String prefix, List<String> strings) {
    if (terminal) { strings.add(prefix); }
    for (int i = 0, n = childMap.length; i < n; ++i) {
      children[i].toStringList(prefix + childMap[i], strings);
    }
  }

  /**
   * A multi-line debugging dump: one line per node, with each child indented
   * one tab deeper than its parent and prefixed by its quoted edge character.
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    toStringBuilder(0, sb);
    return sb.toString();
  }

  private void toStringBuilder(int depth, StringBuilder sb) {
    sb.append(terminal ? "terminal" : "nonterminal");
    ++depth;
    for (int i = 0; i < childMap.length; ++i) {
      sb.append('\n');
      for (int d = 0; d < depth; ++d) {
        sb.append('\t');
      }
      sb.append('\'').append(childMap[i]).append("' ");
      children[i].toStringBuilder(depth, sb);
    }
  }
}
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import javax.annotation.Nullable; /** * Locale independent versions of String case-insensitive operations. * <p> * The normal case insensitive operators {@link String#toLowerCase} * and {@link String#equalsIgnoreCase} depend upon the current locale. * They will fold the letters "i" and "I" differently if the locale is * Turkish than if it is English. * <p> * These operations ignore all case folding for non-Roman letters, and are * independent of the current locale. * Lower-casing is exactly equivalent to {@code tr/A-Z/a-z/}, upper-casing to * {@code tr/a-z/A-Z/}, and case insensitive comparison is equivalent to * lower-casing both then comparing by code-unit. * <p> * Because of this simpler case folding, it is the case that for all Strings s * <code> * Strings.toUpperCase(s).equals(Strings.toUpperCase(Strings.toLowerCase(s))) * </code>. 
* * @author Mike Samuel <mikesamuel@gmail.com> */ final class Strings { public static boolean equalsIgnoreCase( @Nullable String a, @Nullable String b) { if (a == null) { return b == null; } if (b == null) { return false; } int length = a.length(); if (b.length() != length) { return false; } for (int i = length; --i >= 0;) { char c = a.charAt(i), d = b.charAt(i); if (c != d) { if (c <= 'z' && c >= 'A') { if (c <= 'Z') { c |= 0x20; } if (d <= 'Z' && d >= 'A') { d |= 0x20; } if (c == d) { continue; } } return false; } } return true; } public static boolean regionMatchesIgnoreCase( CharSequence a, int aoffset, CharSequence b, int boffset, int n) { if (aoffset + n > a.length() || boffset + n > b.length()) { return false; } for (int i = n; --i >= 0;) { char c = a.charAt(aoffset + i), d = b.charAt(boffset + i); if (c != d) { if (c <= 'z' && c >= 'A') { if (c <= 'Z') { c |= 0x20; } if (d <= 'Z' && d >= 'A') { d |= 0x20; } if (c == d) { continue; } } return false; } } return true; } /** True iff {@code s.equals(String.toLowerCase(s))}. 
*/ public static boolean isLowerCase(CharSequence s) { for (int i = s.length(); --i >= 0;) { char c = s.charAt(i); if (c <= 'Z' && c >= 'A') { return false; } } return true; } private static final char[] LCASE_CHARS = new char['Z' + 1]; private static final char[] UCASE_CHARS = new char['z' + 1]; static { for (int i = 0; i < 'A'; ++i) { LCASE_CHARS[i] = (char) i; } for (int i = 'A'; i <= 'Z'; ++i) { LCASE_CHARS[i] = (char) (i | 0x20); } for (int i = 0; i < 'a'; ++i) { UCASE_CHARS[i] = (char) i; } for (int i = 'a'; i <= 'z'; ++i) { UCASE_CHARS[i] = (char) (i & ~0x20); } } public static String toLowerCase(String s) { for (int i = s.length(); --i >= 0;) { char c = s.charAt(i); if (c <= 'Z' && c >= 'A') { char[] chars = s.toCharArray(); chars[i] = LCASE_CHARS[c]; while (--i >= 0) { c = chars[i]; if (c <= 'Z') { chars[i] = LCASE_CHARS[c]; } } return String.valueOf(chars); } } return s; } public static String toUpperCase(String s) { for (int i = s.length(); --i >= 0;) { char c = s.charAt(i); if (c <= 'z' && c >= 'a') { char[] chars = s.toCharArray(); chars[i] = UCASE_CHARS[c]; while (--i >= 0) { c = chars[i]; if (c <= 'z') { chars[i] = UCASE_CHARS[c]; } } return String.valueOf(chars); } } return s; } private Strings() { /* uninstantiable */ } } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/AttributePolicy.java�����������������0000664�0001750�0001750�00000011166�11654053470�031003� 0����������������������������������������������������������������������������������������������������ustar 
�jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import javax.annotation.Nullable; import javax.annotation.concurrent.Immutable; /** * A policy that can be applied to an HTML attribute to decide whether or not to * allow it in the output, possibly after transforming its value. * * @author Mike Samuel <mikesamuel@gmail.com> * @see HtmlPolicyBuilder.AttributeBuilder#matching(AttributePolicy) */ @TCB public interface AttributePolicy { /** * @param elementName the lower-case element name. * @param attributeName the lower-case attribute name. * @param value the attribute value without quotes and with HTML entities * decoded. * * @return {@code null} to disallow the attribute or the adjusted value if * allowed. */ public @Nullable String apply( String elementName, String attributeName, String value); /** Utilities for working with attribute policies. */ public static final class Util { /** * An attribute policy equivalent to applying all the given policies in * order, failing early if any of them fails. */ public static final AttributePolicy join(AttributePolicy... policies) { class PolicyJoiner { AttributePolicy last = null; AttributePolicy out = null; void join(AttributePolicy p) { if (REJECT_ALL_ATTRIBUTE_POLICY.equals(p)) { out = p; } else if (!REJECT_ALL_ATTRIBUTE_POLICY.equals(out)) { if (p instanceof JoinedAttributePolicy) { JoinedAttributePolicy jap = (JoinedAttributePolicy) p; join(jap.first); join(jap.second); } else if (p != last) { last = p; if (out == null || IDENTITY_ATTRIBUTE_POLICY.equals(out)) { out = p; } else if (!IDENTITY_ATTRIBUTE_POLICY.equals(p)) { out = new JoinedAttributePolicy(out, p); } } } } } PolicyJoiner pu = new PolicyJoiner(); for (AttributePolicy policy : policies) { if (policy == null) { continue; } pu.join(policy); } return pu.out != null ? 
pu.out : IDENTITY_ATTRIBUTE_POLICY; } } public static final AttributePolicy IDENTITY_ATTRIBUTE_POLICY = new AttributePolicy() { public String apply( String elementName, String attributeName, String value) { return value; } }; public static final AttributePolicy REJECT_ALL_ATTRIBUTE_POLICY = new AttributePolicy() { public @Nullable String apply( String elementName, String attributeName, String value) { return null; } }; } @Immutable final class JoinedAttributePolicy implements AttributePolicy { final AttributePolicy first, second; JoinedAttributePolicy(AttributePolicy first, AttributePolicy second) { this.first = first; this.second = second; } public @Nullable String apply( String elementName, String attributeName, String value) { value = first.apply(elementName, attributeName, value); return value != null ? second.apply(elementName, attributeName, value) : null; } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/examples/����������������������������0000775�0001750�0001750�00000000000�11654053470�026626� 5����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/examples/SlashdotPolicyExample.java��0000664�0001750�0001750�00000012560�11654053470�033752� 
0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html.examples;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.owasp.html.Handler;
import org.owasp.html.HtmlPolicyBuilder;
import org.owasp.html.HtmlSanitizer;
import org.owasp.html.HtmlStreamEventReceiver;
import org.owasp.html.HtmlStreamRenderer;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.io.CharStreams;

/**
 * Based on the
 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy Slashdot example</a>.
 * <blockquote>
 * Slashdot (http://www.slashdot.org/) is a techie news site that allows users
 * to respond anonymously to news posts with very limited HTML markup. Now
 * Slashdot is not only one of the coolest sites around, it's also one that's
 * been subject to many different successful attacks. Even more unfortunate is
 * the fact that most of the attacks led users to the infamous goatse.cx picture
 * (please don't go look it up). The rules for Slashdot are fairly strict: users
 * can only submit the following HTML tags and no CSS: {@code <b>}, {@code <u>},
 * {@code <i>}, {@code <a>}, {@code <blockquote>}.
 * <br>
 * Accordingly, we've built a policy file that allows fairly similar
 * functionality. All text-formatting tags that operate directly on the font,
 * color or emphasis have been allowed.
 * </blockquote>
 * <p>
 * Run as a program, this class reads HTML from standard input, sanitizes it
 * with {@link #POLICY_DEFINITION} and writes the result to standard output.
 */
public class SlashdotPolicyExample {

  /**
   * A policy definition that matches the minimal HTML that Slashdot allows.
   * Applying the function to an output receiver yields the policy that
   * {@link #main} passes to {@link HtmlSanitizer#sanitize}.
   */
  public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy>
      POLICY_DEFINITION = new HtmlPolicyBuilder()
          .allowStandardUrlProtocols()
          // Allow title="..." on any element.
          .allowAttributes("title").globally()
          // Allow href="..." on <a> elements.
          .allowAttributes("href").onElements("a")
          // Defeat link spammers.
          .requireRelNofollowOnLinks()
          // Allow lang= with an alphabetic value on any element.
          .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
              .globally()
          // The align attribute on <p> elements can have any value below.
          // NOTE(review): the leading `true` presumably asks for
          // case-insensitive matching of the listed values -- confirm against
          // HtmlPolicyBuilder.AttributeBuilder#matching.
          .allowAttributes("align")
              .matching(true, "center", "left", "right", "justify", "char")
              .onElements("p")
          // These elements are allowed.
          .allowElements(
              "a", "p", "div", "i", "b", "em", "blockquote", "tt", "strong",
              "br", "ul", "ol", "li")
          // Custom slashdot tags.
          // These could be rewritten in the sanitizer using an ElementPolicy.
          .allowElements("quote", "ecode")
          .toFactory();

  /**
   * Sanitizes the HTML read from standard input, decoded as UTF-8, and writes
   * the sanitized form to standard output.
   *
   * @param args must be empty; otherwise a usage message is printed to
   *     standard error and the JVM exits with status -1.
   * @throws IOException if standard input cannot be read.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 0) {
      System.err.println("Reads from STDIN and writes to STDOUT");
      System.exit(-1);
    }
    System.err.println("[Reading from STDIN]");
    // Fetch the HTML to sanitize.
    String html = CharStreams.toString(
        new InputStreamReader(System.in, Charsets.UTF_8));
    // Set up an output channel to receive the sanitized HTML.
    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
        System.out,
        // Receives notifications on a failure to write to the output.
        new Handler<IOException>() {
          public void handle(IOException ex) {
            Throwables.propagate(ex);  // System.out suppresses IOExceptions
          }
        },
        // Our HTML parser is very lenient, but this receives notifications on
        // truly bizarre inputs.
        new Handler<String>() {
          public void handle(String x) {
            throw new AssertionError(x);
          }
        });
    // Use the policy defined above to sanitize the HTML.
    HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
  }
}
// IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html.examples;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;

import org.owasp.html.Handler;
import org.owasp.html.HtmlPolicyBuilder;
import org.owasp.html.HtmlSanitizer;
import org.owasp.html.HtmlStreamEventReceiver;
import org.owasp.html.HtmlStreamRenderer;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Throwables;
import com.google.common.io.CharStreams;

/**
 * Based on the
 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
 * <blockquote>
 * eBay (http://www.ebay.com/) is the most popular online auction site in the
 * universe, as far as I can tell. It is a public site so anyone is allowed to
 * post listings with rich HTML content. It's not surprising that given the
 * attractiveness of eBay as a target that it has been subject to a few complex
 * XSS attacks. Listings are allowed to contain much more rich content than,
 * say, Slashdot- so it's attack surface is considerably larger. The following
 * tags appear to be accepted by eBay (they don't publish rules):
 * {@code <a>},...
 * </blockquote>
 * <p>
 * Run as a program, this class reads HTML from standard input, sanitizes it
 * with {@link #POLICY_DEFINITION} and writes the result to standard output.
 */
public class EbayPolicyExample {

  // Some common regular expression definitions.

  // The 16 color names defined by the HTML Spec (also used by the CSS Spec).
  // The alternation lists 17 names because both the "gray" and "grey"
  // spellings are accepted.
  private static final Pattern COLOR_NAME = Pattern.compile(
      "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
      + "|red|silver|teal|white|yellow)");

  // HTML/CSS Spec allows 3 or 6 digit hex to specify color
  private static final Pattern COLOR_CODE = Pattern.compile(
      "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");

  // One or more decimal digits with an optional trailing percent sign.
  private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
      "[0-9]+%?");
  // Possibly empty run of letters, digits, whitespace and a little
  // punctuation, or two-digit sequences of the form &NN;
  private static final Pattern PARAGRAPH = Pattern.compile(
      "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
  // Non-empty run of ASCII letters and digits, colons, dashes, underscores
  // and dots.
  private static final Pattern HTML_ID = Pattern.compile(
      "[a-zA-Z0-9\\:\\-_\\.]+");
  // force non-empty with a '+' at the end instead of '*'
  private static final Pattern HTML_TITLE = Pattern.compile(
      "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
  // Non-empty; whitespace and commas are allowed so several class names can
  // appear in one value.
  private static final Pattern HTML_CLASS = Pattern.compile(
      "[a-zA-Z0-9\\s,\\-_]+");

  // Either a non-empty run of characters drawn from a set that contains no
  // colon, or a '#' followed by word characters.
  private static final Pattern ONSITE_URL = Pattern.compile(
      "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
  // Must start, after optional whitespace, with one of http://, https://,
  // ftp://, ftps:// or mailto: followed by a letter or digit.
  private static final Pattern OFFSITE_URL = Pattern.compile(
      "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
      + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*\\s*");

  // Optionally signed decimal number; the integer part or the fraction part
  // may be omitted, but not both.
  private static final Pattern NUMBER = Pattern.compile(
      "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");

  private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");

  // Case-insensitive keyword lists for the align and valign attributes.
  private static final Pattern ALIGN = Pattern.compile(
      "(?i)center|left|right|justify|char");

  private static final Pattern VALIGN = Pattern.compile(
      "(?i)baseline|bottom|middle|top");

  /** Accepts a value that fully matches COLOR_NAME or COLOR_CODE. */
  private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
      = new Predicate<String>() {
        public boolean apply(String s) {
          return COLOR_NAME.matcher(s).matches()
              || COLOR_CODE.matcher(s).matches();
        }
      };

  /** Accepts a value that fully matches ONSITE_URL or OFFSITE_URL. */
  private static final Predicate<String> ONSITE_OR_OFFSITE_URL
      = new Predicate<String>() {
        public boolean apply(String s) {
          return ONSITE_URL.matcher(s).matches()
              || OFFSITE_URL.matcher(s).matches();
        }
      };

  // The literal text history.go(-1) with an optional javascript: prefix.
  // This is the only value accepted for the event handler attributes on <a>
  // below.
  private static final Pattern HISTORY_BACK = Pattern.compile(
      "(?:javascript:)?\\Qhistory.go(-1)\\E");

  // Empty, or exactly one char; DOTALL so that the one char may be a line
  // terminator.
  private static final Pattern ONE_CHAR = Pattern.compile(
      ".?", Pattern.DOTALL);

  /**
   * The policy definition assembled from the patterns above.  Applying the
   * function to an output receiver yields the policy that {@link #main}
   * passes to {@link HtmlSanitizer#sanitize}.
   */
  public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy>
      POLICY_DEFINITION = new HtmlPolicyBuilder()
          // Attributes allowed on any element.
          .allowAttributes("id").matching(HTML_ID).globally()
          .allowAttributes("class").matching(HTML_CLASS).globally()
          .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
              .globally()
          .allowAttributes("title").matching(HTML_TITLE).globally()
          .allowStyling()
          // Text-level elements.
          .allowAttributes("align").matching(ALIGN).onElements("p")
          .allowAttributes("for").matching(HTML_ID).onElements("label")
          .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
              .onElements("font")
          .allowAttributes("face")
              .matching(Pattern.compile("[\\w;, \\-]+"))
              .onElements("font")
          .allowAttributes("size").matching(NUMBER).onElements("font")
          // Links.
          .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
              .onElements("a")
          .allowStandardUrlProtocols()
          .allowAttributes("nohref").onElements("a")
          .allowAttributes("name").matching(NAME).onElements("a")
          .allowAttributes(
              "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
              .matching(HISTORY_BACK).onElements("a")
          .requireRelNofollowOnLinks()
          // Images.
          .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
              .onElements("img")
          .allowAttributes("name").matching(NAME)
              .onElements("img")
          .allowAttributes("alt").matching(PARAGRAPH)
              .onElements("img")
          .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
              .onElements("img")
          // Tables.
          .allowAttributes("border", "cellpadding", "cellspacing")
              .matching(NUMBER).onElements("table")
          .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
              .onElements("table")
          .allowAttributes("background").matching(ONSITE_URL)
              .onElements("table")
          .allowAttributes("align").matching(ALIGN)
              .onElements("table")
          .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
              .onElements("table")
          // Table rows and cells.
          .allowAttributes("background").matching(ONSITE_URL)
              .onElements("td", "th", "tr")
          .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
              .onElements("td", "th")
          .allowAttributes("abbr").matching(PARAGRAPH)
              .onElements("td", "th")
          .allowAttributes("axis", "headers").matching(NAME)
              .onElements("td", "th")
          .allowAttributes("scope")
              .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
              .onElements("td", "th")
          .allowAttributes("nowrap")
              .onElements("td", "th")
          // Sizing and alignment shared by several table parts and images.
          .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
              .onElements("table", "td", "th", "tr", "img")
          .allowAttributes("align").matching(ALIGN)
              .onElements("thead", "tbody", "tfoot", "img",
                           "td", "th", "tr", "colgroup", "col")
          .allowAttributes("valign").matching(VALIGN)
              .onElements("thead", "tbody", "tfoot",
                          "td", "th", "tr", "colgroup", "col")
          .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
              .onElements("td", "th", "tr", "colgroup", "col",
                          "thead", "tbody", "tfoot")
          .allowAttributes("char").matching(ONE_CHAR)
              .onElements("td", "th", "tr", "colgroup", "col",
                          "thead", "tbody", "tfoot")
          .allowAttributes("colspan", "rowspan").matching(NUMBER)
              .onElements("td", "th")
          .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
              .onElements("colgroup", "col")
          // The elements that are allowed at all.
          // NOTE(review): "a" does not appear in this list although several
          // attributes are allowed on it above -- confirm whether
          // onElements("a") / requireRelNofollowOnLinks() is enough to let
          // the element itself through.
          .allowElements(
              "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
              "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
              "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
              "hr", "br", "col", "font", "map", "span", "div", "img",
              "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
              "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
          .toFactory();

  /**
   * Sanitizes the HTML read from standard input, decoded as UTF-8, and writes
   * the sanitized form to standard output.
   *
   * @param args must be empty; otherwise a usage message is printed to
   *     standard error and the JVM exits with status -1.
   * @throws IOException if standard input cannot be read.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 0) {
      System.err.println("Reads from STDIN and writes to STDOUT");
      System.exit(-1);
    }
    System.err.println("[Reading from STDIN]");
    // Fetch the HTML to sanitize.
    String html = CharStreams.toString(
        new InputStreamReader(System.in, Charsets.UTF_8));
    // Set up an output channel to receive the sanitized HTML.
    HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
        System.out,
        // Receives notifications on a failure to write to the output.
        new Handler<IOException>() {
          public void handle(IOException ex) {
            Throwables.propagate(ex);  // System.out suppresses IOExceptions
          }
        },
        // Our HTML parser is very lenient, but this receives notifications on
        // truly bizarre inputs.
        new Handler<String>() {
          public void handle(String x) {
            throw new AssertionError(x);
          }
        });
    // Use the policy defined above to sanitize the HTML.
    HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
  }
}
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.List; import javax.annotation.Nullable; import javax.annotation.concurrent.Immutable; /** * A policy that can be applied to an element to decide whether or not to * allow it in the output, possibly after transforming attributes. * <p> * Element policies are applied <strong>after</strong> * {@link AttributePolicy attribute policies} so * they can be used to add extra attributes. * * @author Mike Samuel <mikesamuel@gmail.com> * @see HtmlPolicyBuilder#allowElements(ElementPolicy, String...) */ @TCB public interface ElementPolicy { /** * @param elementName the lower-case element name. * @param attrs a list of alternating attribute names and values. * The list may be added to or removed from. 
When removing, be * careful to remove both the name and its associated value. * * @return {@code null} to disallow the element, or the adjusted element name. */ public @Nullable String apply(String elementName, List<String> attrs); /** Utilities for working with element policies. */ public static final class Util { private Util() { /* uninstantiable */ } /** * Given zero or more element policies, returns an element policy equivalent * to applying them in order failing early if any of them fails. */ public static final ElementPolicy join(ElementPolicy... policies) { class PolicyJoiner { ElementPolicy last = null; ElementPolicy out = null; void join(ElementPolicy p) { if (p == REJECT_ALL_ELEMENT_POLICY) { out = p; } else if (out != REJECT_ALL_ELEMENT_POLICY) { if (p instanceof JoinedElementPolicy) { JoinedElementPolicy jep = (JoinedElementPolicy) p; join(jep.first); join(jep.second); } else if (p != last) { last = p; if (out == null || out == IDENTITY_ELEMENT_POLICY) { out = p; } else if (p != IDENTITY_ELEMENT_POLICY) { out = new JoinedElementPolicy(out, p); } } } } } PolicyJoiner pu = new PolicyJoiner(); for (ElementPolicy policy : policies) { if (policy == null) { continue; } pu.join(policy); } return pu.out != null ? pu.out : IDENTITY_ELEMENT_POLICY; } } public static final ElementPolicy IDENTITY_ELEMENT_POLICY = new ElementPolicy() { public String apply(String elementName, List<String> attrs) { return elementName; } }; public static final ElementPolicy REJECT_ALL_ELEMENT_POLICY = new ElementPolicy() { public @Nullable String apply(String elementName, List<String> attrs) { return null; } }; } @Immutable final class JoinedElementPolicy implements ElementPolicy { final ElementPolicy first, second; JoinedElementPolicy(ElementPolicy first, ElementPolicy second) { this.first = first; this.second = second; } public @Nullable String apply(String elementName, List<String> attrs) { elementName = first.apply(elementName, attrs); return elementName != null ? 
second.apply(elementName, attrs) : null; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/package-info.java��������������������0000664�0001750�0001750�00000003445�11654053470�030205� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. /** * An efficient {@link org.owasp.html.HtmlSanitizer HtmlSanitizer} * configurable via a flexible * {@link org.owasp.html.HtmlPolicyBuilder HtmlPolicyBuilder}. * * @author Mike Samuel <mikesamuel@gmail.com> */ @javax.annotation.ParametersAreNonnullByDefault package org.owasp.html; ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/Handler.java�������������������������0000664�0001750�0001750�00000004265�11654053470�027237� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import com.google.common.base.Throwables; /** * Receives notification of problems. * * @author Mike Samuel <mikesamuel@gmail.com> */ public interface Handler<T> { void handle(T x); /** A handler that does nothing given any input. */ public static final Handler<Object> DO_NOTHING = new Handler<Object>() { public void handle(Object x) { // Really, do nothing. } }; /** * A handler that re-raises an error, wrapping it in a runtime exception if * necessary. 
*/ public static final Handler<Throwable> PROPAGATE = new Handler<Throwable>() { public void handle(Throwable th) { Throwables.propagate(th); } }; } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/StandardUrlAttributePolicy.java������0000664�0001750�0001750�00000005463�11654053470�033152� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; /** * A URL checker optimized to avoid object allocation for the common case: * {@code http}, {@code https}, {@code mailto}. */ @TCB final class StandardUrlAttributePolicy implements AttributePolicy { static final StandardUrlAttributePolicy INSTANCE = new StandardUrlAttributePolicy(); private StandardUrlAttributePolicy() { /* singleton */ } public String apply(String elementName, String attributeName, String s) { protocol_loop: for (int i = 0, n = s.length(); i < n; ++i) { switch (s.charAt(i)) { case '/': case '#': case '?': // No protocol. 
break protocol_loop; case ':': switch (i) { case 4: if (!Strings.regionMatchesIgnoreCase("http", 0, s, 0, 4)) { return null; } break; case 5: if (!Strings.regionMatchesIgnoreCase("https", 0, s, 0, 5)) { return null; } break; case 6: if (!Strings.regionMatchesIgnoreCase("mailto", 0, s, 0, 6)) { return null; } break; default: return null; } break protocol_loop; } } return FilterUrlByProtocolAttributePolicy.normalizeUri(s); } }�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlSanitizer.java�������������������0000664�0001750�0001750�00000021770�11654053470�030457� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.LinkedList; import java.util.List; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; /** * Consumes an HTML stream, and dispatches events to a policy object which * decides which elements and attributes to allow. */ public final class HtmlSanitizer { /** * Receives events based on the HTML stream, and applies a policy to decide * what HTML constructs to allow. * Typically, implementations use an {@link HtmlStreamRenderer} to produce * the sanitized output. * * <p> * <b>Implementations of this class are in the TCB.</b></p> */ @TCB public interface Policy extends HtmlStreamEventReceiver { /** * Called when an HTML tag like {@code <foo bar=baz>} is seen in the input. * * @param elementName a normalized (lower-case for non-namespaced names) * element name. * @param attrs a list of alternating attribute name and value pairs. * For efficiency, this list may be mutated by this during this method * call, but ownership reverts to the caller on method exit. * The values are raw -- HTML entities have been decoded. 
* Specifically, implementations are allowed to use a list iterator * and remove all disallowed attributes, add necessary attributes, and * then pass the list to an {@link HtmlStreamRenderer}. */ void openTag(String elementName, List<String> attrs); /** * Called when an HTML tag like {@code </foo>} is seen in the input. * * @param elementName a normalized (lower-case for non-namespaced names) * element name. */ void closeTag(String elementName); /** * Called when textual content is seen. * @param textChunk raw content -- HTML entities have been decoded. */ void text(String textChunk); } /** * Sanitizes the given HTML by applying the given policy to it. * * <p> * This method is not in the TCB. * * <p> * This method has no return value since policies are assumed to render things * they accept and do nothing on things they reject. * Use {@link HtmlStreamRenderer} to render content to an output buffer. * * @param html A snippet of HTML to sanitize. {@code null} is treated as the * empty string and will not result in a {@code NullPointerException}. * @param policy The Policy that will receive events based on the tokens in * html. Typically, this policy ends up routing the events to an * {@link HtmlStreamRenderer} after filtering. * {@link HtmlPolicyBuilder} provides an easy way to create policies. */ public static void sanitize(@Nullable String html, final Policy policy) { if (html == null) { html = ""; } TagBalancingHtmlStreamEventReceiver balancer = new TagBalancingHtmlStreamEventReceiver(policy); // According to Opera the maximum table nesting depth seen in the wild is // 795, but 99.99% of documents have a table nesting depth of less than 22. // Since each table has a nesting depth of 4 (incl. TBODY), this leads to a // document depth of 90 (incl. HTML & BODY). // Obviously table nesting depth is not the same as whole document depth, // but it is the best proxy I have available. 
// See http://devfiles.myopera.com/articles/590/maxtabledepth-url.htm for // the original data. // Webkit defines the maximum HTML parser tree depth as 512. // http://trac.webkit.org/browser/trunk/Source/WebCore/page/Settings.h#L408 // static const unsigned defaultMaximumHTMLParserDOMTreeDepth = 512; // The first number gives us a lower bound on the nesting depth we allow, // 90, and the second gives us an upper bound: 512. // We do not want to bump right up against that limit. // 256 is substantially larger than the lower bound and well clear of the // upper bound. balancer.setNestingLimit(256); balancer.openDocument(); HtmlLexer lexer = new HtmlLexer(html); // Use a linked list so that policies can use Iterator.remove() in an O(1) // way. LinkedList<String> attrs = Lists.newLinkedList(); while (lexer.hasNext()) { HtmlToken token = lexer.next(); switch (token.type) { case TEXT: balancer.text(decodeHtml(html.substring(token.start, token.end))); break; case UNESCAPED: balancer.text(html.substring(token.start, token.end)); break; case TAGBEGIN: if (html.charAt(token.start + 1) == '/') { // A close tag. balancer.closeTag(HtmlLexer.canonicalName( html.substring(token.start + 2, token.end))); while (lexer.hasNext() && lexer.next().type != HtmlTokenType.TAGEND) { // skip tokens until we see a ">" } } else { attrs.clear(); boolean attrsReadyForName = true; tagBody: while (lexer.hasNext()) { HtmlToken tagBodyToken = lexer.next(); switch (tagBodyToken.type) { case ATTRNAME: if (!attrsReadyForName) { // Last attribute added was valueless. 
attrs.add(attrs.getLast()); } else { attrsReadyForName = false; } attrs.add(HtmlLexer.canonicalName( html.substring(tagBodyToken.start, tagBodyToken.end))); break; case ATTRVALUE: attrs.add(decodeHtml(stripQuotes( html.substring(tagBodyToken.start, tagBodyToken.end)))); attrsReadyForName = true; break; case TAGEND: break tagBody; default: // Just drop anything not recognized } } if (!attrsReadyForName) { attrs.add(attrs.getLast()); } balancer.openTag( HtmlLexer.canonicalName( html.substring(token.start + 1, token.end)), attrs); } break; default: // Ignore comments, directives, and other stuff that shouldn't show // up in the output. break; } } balancer.closeDocument(); } private static String stripQuotes(String encodedAttributeValue) { int n = encodedAttributeValue.length(); if (n > 0) { char last = encodedAttributeValue.charAt(n - 1); if (last == '"' || last == '\'') { int start = 0; if (n != 1 && last == encodedAttributeValue.charAt(0)) { start = 1; } else { // Browsers deal with missing left quotes : <img src=foo.png"> // but generally do not deal with missing right : <img src="foo.png> } return encodedAttributeValue.substring(start, n - 1); } } return encodedAttributeValue; } @VisibleForTesting static String decodeHtml(String s) { int amp = s.indexOf('&'); if (amp < 0) { return s; } int pos = 0; int n = s.length(); StringBuilder sb = new StringBuilder(n); int end; do { long endAndCodepoint = HtmlEntities.decodeEntityAt(s, amp, n); end = (int) (endAndCodepoint >>> 32); int codepoint = (int) endAndCodepoint; sb.append(s, pos, amp).appendCodePoint(codepoint); pos = end; } while ((amp = s.indexOf('&', end)) >= 0); return sb.append(s, pos, n).toString(); } } ��������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/PolicyFactory.java�������������������0000664�0001750�0001750�00000010051�11654053470�030437� 0����������������������������������������������������������������������������������������������������ustar 
�jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import java.util.Map; import javax.annotation.Nullable; import javax.annotation.concurrent.Immutable; import javax.annotation.concurrent.ThreadSafe; import com.google.common.base.Function; import com.google.common.collect.ImmutableMap; /** * A factory that can be used to link a sanitizer to an output receiver and that * provides a convenient <code>{@link PolicyFactory#sanitize sanitize}</code> * method and a <code>{@link PolicyFactory#and and}</code> method to compose * policies. * * @author Mike Samuel <mikesamuel@gmail.com> */ @ThreadSafe @Immutable @TCB public final class PolicyFactory implements Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> { private final ImmutableMap<String, ElementAndAttributePolicies> policies; private final boolean allowStyling; PolicyFactory(ImmutableMap<String, ElementAndAttributePolicies> policies, boolean allowStyling) { this.policies = policies; this.allowStyling = allowStyling; } /** Produces a sanitizer that emits tokens to out. */ public HtmlSanitizer.Policy apply(HtmlStreamEventReceiver out) { if (allowStyling) { return new StylingPolicy(out, policies); } else { return new ElementAndAttributePolicyBasedSanitizerPolicy( out, policies); } } /** A convenience function that sanitizes a string of HTML. */ public String sanitize(@Nullable String html) { if (html == null) { return ""; } StringBuilder out = new StringBuilder(html.length()); HtmlSanitizer.sanitize( html, apply(HtmlStreamRenderer.create(out, Handler.DO_NOTHING))); return out.toString(); } /** * Produces a factory that allows the union of the grants, and intersects * policies where they overlap on a particular granted attribute or element * name. 
*/ public PolicyFactory and(PolicyFactory f) { ImmutableMap.Builder<String, ElementAndAttributePolicies> b = ImmutableMap.builder(); for (Map.Entry<String, ElementAndAttributePolicies> e : policies.entrySet()) { String elName = e.getKey(); ElementAndAttributePolicies p = e.getValue(); ElementAndAttributePolicies q = f.policies.get(elName); if (q != null) { p = p.and(q); } b.put(elName, p); } for (Map.Entry<String, ElementAndAttributePolicies> e : f.policies.entrySet()) { String elName = e.getKey(); if (!policies.containsKey(elName)) { b.put(elName, e.getValue()); } } return new PolicyFactory(b.build(), allowStyling || f.allowStyling); } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/Sanitizers.java����������������������0000664�0001750�0001750�00000007751�11654053470�030020� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; /** * Pre-packaged HTML sanitizer policies. * * <p> * These policies can be used to sanitize content. * </p> * <pre> * Sanitizers.FORMATTING.sanitize({@code "<b>Hello, World!</b>"}) * </pre> * and can be chained * <pre> * PolicyFactory sanitizer = Sanitizers.FORMATTING.and(Sanitizers.BLOCKS); * System.out.println(sanitizer.sanitize({@code "<p>Hello, <b>World!</b>"})); * </pre> * * <p> * For more fine-grained control over sanitization, use * {@link HtmlPolicyBuilder}. * </p> * * @author Mike Samuel <mikesamuel@gmail.com> */ public final class Sanitizers { /** * Allows common formatting elements including {@code <b>}, {@code <i>}, etc. 
*/ public static final PolicyFactory FORMATTING = new HtmlPolicyBuilder() .allowCommonInlineFormattingElements().toFactory(); /** * Allows common block elements including <code>&lt;p&gt;</code>, * <code>&lt;h1&gt;</code>, etc. */ public static final PolicyFactory BLOCKS = new HtmlPolicyBuilder() .allowCommonBlockElements().toFactory(); /** * Allows certain safe CSS properties in {@code style="..."} attributes. */ public static final PolicyFactory STYLES = new HtmlPolicyBuilder() .allowStyling().toFactory(); /** * Allows HTTP, HTTPS, MAILTO, and relative links. */ public static final PolicyFactory LINKS = new HtmlPolicyBuilder() .allowStandardUrlProtocols().allowElements("a") .allowAttributes("href").onElements("a").requireRelNofollowOnLinks() .toFactory(); private static final AttributePolicy INTEGER = new AttributePolicy() { public String apply(String elementName, String attributeName, String value) { int n = value.length(); if (n == 0) { return null; } for (int i = 0; i < n; ++i) { char ch = value.charAt(i); if (ch == '.') { if (i == 0) { return null; } return value.substring(0, i); // truncate to integer. } else if (!('0' <= ch && ch <= '9')) { return null; } } return value; } }; /** * Allows {@code <img>} elements from HTTP, HTTPS, and relative sources. */ public static final PolicyFactory IMAGES = new HtmlPolicyBuilder() .allowUrlProtocols("http", "https").allowElements("img") .allowAttributes("alt", "src").onElements("img") .allowAttributes("border", "height", "width").matching(INTEGER) .onElements("img") .toFactory(); private Sanitizers() { // Uninstantiable. 
} } �����������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlStreamEventReceiver.java���������0000664�0001750�0001750�00000003654�11654053470�032432� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
/**
 * A light-weight SAX-like listener for HTML.
 */
public interface HtmlStreamEventReceiver {

  /** Signals the start of a document. */
  void openDocument();

  /** Signals the end of a document. */
  void closeDocument();

  /**
   * Signals an open tag.
   *
   * @param elementName the name of the element being opened.
   * @param attrs alternating attribute names and values.
   */
  void openTag(String elementName, List<String> attrs);

  /**
   * Signals a close tag.
   *
   * @param elementName the name of the element being closed.
   */
  void closeTag(String elementName);

  /**
   * Signals a chunk of text.
   *
   * @param text the text content.
   */
  void text(String text);

}
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Pattern; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; /** * An HTML sanitizer policy that tries to preserve simple CSS by converting it * to {@code <font>} tags which allow fewer ways to embed JavaScript. 
*/ @TCB class StylingPolicy extends ElementAndAttributePolicyBasedSanitizerPolicy { StylingPolicy( HtmlStreamEventReceiver out, ImmutableMap<String, ElementAndAttributePolicies> elAndAttrPolicies) { super(out, elAndAttrPolicies); } @Override public void openTag(String elementName, List<String> attrs) { // Parts of the superclass method are repeated here, so if you change this, // be sure to check the super-class. String style = null; for (Iterator<String> it = attrs.iterator(); it.hasNext();) { String name = it.next(); if ("style".equals(name)) { style = it.next(); break; } else { it.next(); } } ElementAndAttributePolicies policies = elAndAttrPolicies.get(elementName); String adjustedElementName = applyPolicies(elementName, attrs, policies); if (adjustedElementName != null) { List<String> fontAttributes = null; if (style != null) { fontAttributes = cssPropertiesToFontAttributes(style); if (fontAttributes.isEmpty()) { fontAttributes = null; } } // If we have something to output, emit it. if (!(attrs.isEmpty() && policies.skipIfEmpty && fontAttributes == null)) { skipText = false; writeOpenTag(policies, adjustedElementName, attrs); if (fontAttributes != null) { synthesizeOpenTag("font", fontAttributes); // Rely on the tag balancer to close it. } return; } } deferOpenTag(elementName); } /** Used to track CSS property names while processing CSS. 
*/ private enum CssPropertyType { FONT, FACE, SIZE, BACKGROUND_COLOR, COLOR, DIRECTION, UNICODE_BIDI, ALIGN, WEIGHT, STYLE, TEXT_DECORATION, NONE, ; } private static final Pattern ALLOWED_CSS_SIZE = Pattern.compile( "medium|(?:small|large)r|(?:xx?-)(?:small|large)|[0-9]+(p[tx]|%)"); private static final Pattern ALLOWED_CSS_WEIGHT = Pattern.compile( "normal|bold(?:er)?|lighter|[1-9]00"); private static final Set<String> ALLOWED_CSS_STYLE = ImmutableSet.of( "italic", "oblique", "normal"); private static final Set<String> ALLOWED_TEXT_DECORATION = ImmutableSet.of( "underline", "overline", "line-through"); private static final Set<String> ALLOWED_UNICODE_BIDI = ImmutableSet.of( "inherit", "normal", "embed", "bidi-override"); private static final Set<String> ALLOWED_DIRECTION = ImmutableSet.of( "inherit", "ltr", "rtl"); private static final ImmutableMap<String, CssPropertyType> BY_CSS_PROPERTY_NAME = ImmutableMap.<String, CssPropertyType>builder() .put("font", CssPropertyType.FONT) .put("font-family", CssPropertyType.FACE) .put("font-size", CssPropertyType.SIZE) .put("color", CssPropertyType.COLOR) .put("text-align", CssPropertyType.ALIGN) .put("direction", CssPropertyType.DIRECTION) .put("font-weight", CssPropertyType.WEIGHT) .put("font-style", CssPropertyType.STYLE) .put("text-decoration", CssPropertyType.TEXT_DECORATION) .put("unicode-bidi", CssPropertyType.UNICODE_BIDI) .put("background", CssPropertyType.BACKGROUND_COLOR) .put("background-color", CssPropertyType.BACKGROUND_COLOR) .build(); /** * Lossy conversion from CSS properties into the attributes of a * <code>&lt;font&gt;</code> tag that allows textual styling that affects * layout, but does not allow breaking out of a clipping region, absolute * positioning, image loading, tab index changes, or code execution. * * @return A list of alternating attribute names and values. */ @VisibleForTesting static List<String> cssPropertiesToFontAttributes(String style) { // We walk over CSS tokens to extract salient bits. 
class StyleExtractor implements CssGrammar.PropertyHandler { CssPropertyType type = CssPropertyType.NONE; // Values that are not-whitelisted are put into font attributes to render // the innocuous. StringBuilder face, color, backgroundColor; String align; // These values are white-listed so we know they can't affect anything // other than font-face appearance, and layout. String cssSize, cssWeight, cssFontStyle, cssTextDecoration; // Bidi support styles. String cssDir, cssUnicodeBidi; public void url(String token) { // Ignore. } public void startProperty(String propertyName) { CssPropertyType type = BY_CSS_PROPERTY_NAME.get(propertyName); this.type = type != null ? type : CssPropertyType.NONE; } public void quotedString(String token) { switch (type) { case FONT: case FACE: if (face == null) { face = new StringBuilder(); } face.append(' ').append(CssGrammar.cssContent(token)); break; default: break; } } public void quantity(String token) { switch (type) { case FONT: case SIZE: token = Strings.toLowerCase(token); if (ALLOWED_CSS_SIZE.matcher(token).matches()) { cssSize = token; } break; case FACE: if (face == null) { face = new StringBuilder(); } face.append(' ').append(token); break; case BACKGROUND_COLOR: if (backgroundColor == null) { backgroundColor = new StringBuilder(); } else { backgroundColor.append(' '); } backgroundColor.append(token); break; case COLOR: if (color == null) { color = new StringBuilder(); } else { color.append(' '); } color.append(token); break; case WEIGHT: if (ALLOWED_CSS_WEIGHT.matcher(token).matches()) { cssWeight = token; } break; default: break; } } public void identifierOrHash(String token) { switch (type) { case SIZE: token = Strings.toLowerCase(token); if (ALLOWED_CSS_SIZE.matcher(token).matches()) { cssSize = token; } break; case WEIGHT: token = Strings.toLowerCase(token); if (ALLOWED_CSS_WEIGHT.matcher(token).matches()) { cssWeight = token; } break; case FACE: if (face == null) { face = new StringBuilder(); } face.append(' 
').append(token); break; case FONT: token = Strings.toLowerCase(token); if (ALLOWED_CSS_WEIGHT.matcher(token).matches()) { cssWeight = token; } else if (ALLOWED_CSS_SIZE.matcher(token).matches()) { cssSize = token; } else if (ALLOWED_CSS_STYLE.contains(token)) { cssFontStyle = token; } else { if (face == null) { face = new StringBuilder(); } face.append(' ').append(token); } break; case BACKGROUND_COLOR: if (backgroundColor == null) { backgroundColor = new StringBuilder(); backgroundColor.append(token); } break; case COLOR: if (color == null) { color = new StringBuilder(); color.append(token); } break; case STYLE: token = Strings.toLowerCase(token); if (ALLOWED_CSS_STYLE.contains(token)) { cssFontStyle = token; } break; case ALIGN: align = token; break; case DIRECTION: token = Strings.toLowerCase(token); if (ALLOWED_DIRECTION.contains(token)) { cssDir = token; } break; case UNICODE_BIDI: token = Strings.toLowerCase(token); if (ALLOWED_UNICODE_BIDI.contains(token)) { cssUnicodeBidi = token; } break; case TEXT_DECORATION: token = Strings.toLowerCase(token); if (ALLOWED_TEXT_DECORATION.contains(token)) { cssTextDecoration = token; } break; default: break; } } public void punctuation(String token) { switch (type) { case FACE: case FONT: // Commas separate font-families since HTML fonts fall-back to // simpler forms based on the installed font-set. if (",".equals(token) && face != null) { face.append(','); } break; case BACKGROUND_COLOR: // Parentheses and commas in the rgb(...) color form. if (backgroundColor != null) { backgroundColor.append(token); } break; case COLOR: // Parentheses and commas in the rgb(...) color form. 
if (color != null) { color.append(token); } break; default: break; } } public void endProperty() { type = CssPropertyType.NONE; } @TCB List<String> toFontAttributes() { List<String> fontAttributes = Lists.newArrayList(); if (face != null) { fontAttributes.add("face"); fontAttributes.add(face.toString().trim()); } if (align != null) { fontAttributes.add("align"); fontAttributes.add(align); } ImmutableList<String> styleParts; { ImmutableList.Builder<String> b = ImmutableList.builder(); if (cssWeight != null) { b.add("font-weight").add(cssWeight); } if (cssSize != null) { b.add("font-size").add(cssSize); } if (cssFontStyle != null) { b.add("font-style").add(cssFontStyle); } if (cssTextDecoration != null) { b.add("text-decoration").add(cssTextDecoration); } if (cssDir != null) { b.add("direction").add(cssDir); } if (cssUnicodeBidi != null) { b.add("unicode-bidi").add(cssUnicodeBidi); } if (backgroundColor != null) { String safeColor = sanitizeColor(backgroundColor.toString()); if (safeColor != null) { b.add("background-color").add(safeColor); } } if (color != null) { String safeColor = sanitizeColor(color.toString()); if (safeColor != null) { b.add("color").add(safeColor); } } styleParts = b.build(); } if (!styleParts.isEmpty()) { StringBuilder cssProperties = new StringBuilder(); boolean isPropertyName = true; for (String stylePart : styleParts) { cssProperties.append(stylePart).append(isPropertyName ? ':' : ';'); isPropertyName = !isPropertyName; } int len = cssProperties.length(); if (len != 0 && cssProperties.charAt(len - 1) == ';') { cssProperties.setLength(len - 1); } fontAttributes.add("style"); fontAttributes.add(cssProperties.toString()); } return fontAttributes; } } StyleExtractor extractor = new StyleExtractor(); CssGrammar.asPropertyGroup(style, extractor); return extractor.toFontAttributes(); } /** * Converts the various CSS syntactic forms for colors to a hex value or null. 
* If the input is not a valid CSS color expression, then this method either * returns null or returns a valid CSS hash color but the particular hash * color is not well specified (besides being deterministic). */ static String sanitizeColor(String s) { if (s.length() == 0) { return null; } s = Strings.toLowerCase(s); String hex = COLOR_TABLE.get(s); if (hex != null) { return hex; } int n = s.length(); if (s.charAt(0) == '#') { if (n != 4 && n != 7) { return null; } for (int i = 1; i < n; ++i) { char ch = s.charAt(i); if (!(('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f'))) { return null; } } return s; } // Handle rgb and rgba if (!s.startsWith("rgb")) { return null; } StringBuilder sb = new StringBuilder(7); sb.append('#'); if (translateDecimalOrPctByteToHex( s, translateDecimalOrPctByteToHex( s, translateDecimalOrPctByteToHex(s, 3, sb), sb), sb) == -1) { return null; } // #aabbcc -> #abc if (sb.charAt(1) == sb.charAt(2) && sb.charAt(3) == sb.charAt(4) && sb.charAt(5) == sb.charAt(6)) { sb.setCharAt(2, sb.charAt(3)); sb.setCharAt(3, sb.charAt(5)); sb.setLength(4); } return sb.toString(); } private static boolean isDecimalDigit(char ch) { return '0' <= ch && ch <= '9'; } /** * Looks for a decimal number in the range 0-255 or a percentage into a * hex pair written to out. Returns the index after the number. */ private static int translateDecimalOrPctByteToHex( String s, int i, StringBuilder out) { if (i == -1) { return -1; } int n = s.length(); for (; i < n; ++i) { char ch = s.charAt(i); // Look for the first digit. if (isDecimalDigit(ch) || ch == '.') { int value; if (ch != '.') { value = ch - '0'; // Reduce the run of digits to a decimal number. while (++i < n) { ch = s.charAt(i); if (isDecimalDigit(ch)) { value = value * 10 + (ch - '0'); } else { break; } } } else { value = 0; } float fraction = 0; if (s.charAt(i) == '.') { int numerator = 0; int denominator = 1; // Consume any decimal portion. // TODO: Maybe incorporate into value. 
while (++i < n) { ch = s.charAt(i); if (!isDecimalDigit(ch)) { break; } numerator = numerator * 10 + (ch - '0'); denominator *= 10; } fraction = ((float) numerator) / denominator; } // Convert the decimal number to a percentage if appropriate. if (i < n && s.charAt(i) == '%') { // TODO: is this the right rounding mode? value = (int) Math.round((value + fraction) * 2.55); ++i; } else if (value < 0xff && fraction > 0.5) { ++value; } if (0 <= value && value <= 0xff) { out.append("0123456789abcdef".charAt(value >>> 4)) .append("0123456789abcdef".charAt(value & 0xf)); return i; } return -1; } } return -1; } /** Maps CSS3 color keywords to unambiguous hash values. */ private static final ImmutableMap<String, String> COLOR_TABLE = ImmutableMap.<String, String>builder() .put("aliceblue", "#f0f8ff") .put("antiquewhite", "#faebd7") .put("aqua", "#0ff") .put("aquamarine", "#7fffd4") .put("azure", "#f0ffff") .put("beige", "#f5f5dc") .put("bisque", "#ffe4c4") .put("black", "#000") .put("blanchedalmond", "#ffebcd") .put("blue", "#00f") .put("blueviolet", "#8a2be2") .put("brown", "#a52a2a") .put("burlywood", "#deb887") .put("cadetblue", "#5f9ea0") .put("chartreuse", "#7fff00") .put("chocolate", "#d2691e") .put("coral", "#ff7f50") .put("cornflowerblue", "#6495ed") .put("cornsilk", "#fff8dc") .put("crimson", "#dc143c") .put("cyan", "#0ff") .put("darkblue", "#00008b") .put("darkcyan", "#008b8b") .put("darkgoldenrod", "#b8860b") .put("darkgray", "#a9a9a9") .put("darkgreen", "#006400") .put("darkgrey", "#a9a9a9") .put("darkkhaki", "#bdb76b") .put("darkmagenta", "#8b008b") .put("darkolivegreen", "#556b2f") .put("darkorange", "#ff8c00") .put("darkorchid", "#9932cc") .put("darkred", "#8b0000") .put("darksalmon", "#e9967a") .put("darkseagreen", "#8fbc8f") .put("darkslateblue", "#483d8b") .put("darkslategray", "#2f4f4f") .put("darkslategrey", "#2f4f4f") .put("darkturquoise", "#00ced1") .put("darkviolet", "#9400d3") .put("deeppink", "#ff1493") .put("deepskyblue", "#00bfff") .put("dimgray", 
"#696969") .put("dimgrey", "#696969") .put("dodgerblue", "#1e90ff") .put("firebrick", "#b22222") .put("floralwhite", "#fffaf0") .put("forestgreen", "#228b22") .put("fuchsia", "#f0f") .put("gainsboro", "#dcdcdc") .put("ghostwhite", "#f8f8ff") .put("gold", "#ffd700") .put("goldenrod", "#daa520") .put("gray", "#808080") .put("green", "#008000") .put("greenyellow", "#adff2f") .put("grey", "#808080") .put("honeydew", "#f0fff0") .put("hotpink", "#ff69b4") .put("indianred", "#cd5c5c") .put("indigo", "#4b0082") .put("ivory", "#fffff0") .put("khaki", "#f0e68c") .put("lavender", "#e6e6fa") .put("lavenderblush", "#fff0f5") .put("lawngreen", "#7cfc00") .put("lemonchiffon", "#fffacd") .put("lightblue", "#add8e6") .put("lightcoral", "#f08080") .put("lightcyan", "#e0ffff") .put("lightgoldenrodyellow", "#fafad2") .put("lightgray", "#d3d3d3") .put("lightgreen", "#90ee90") .put("lightgrey", "#d3d3d3") .put("lightpink", "#ffb6c1") .put("lightsalmon", "#ffa07a") .put("lightseagreen", "#20b2aa") .put("lightskyblue", "#87cefa") .put("lightslategray", "#789") .put("lightslategrey", "#789") .put("lightsteelblue", "#b0c4de") .put("lightyellow", "#ffffe0") .put("lime", "#0f0") .put("limegreen", "#32cd32") .put("linen", "#faf0e6") .put("magenta", "#f0f") .put("maroon", "#800000") .put("mediumaquamarine", "#66cdaa") .put("mediumblue", "#0000cd") .put("mediumorchid", "#ba55d3") .put("mediumpurple", "#9370db") .put("mediumseagreen", "#3cb371") .put("mediumslateblue", "#7b68ee") .put("mediumspringgreen", "#00fa9a") .put("mediumturquoise", "#48d1cc") .put("mediumvioletred", "#c71585") .put("midnightblue", "#191970") .put("mintcream", "#f5fffa") .put("mistyrose", "#ffe4e1") .put("moccasin", "#ffe4b5") .put("navajowhite", "#ffdead") .put("navy", "#000080") .put("oldlace", "#fdf5e6") .put("olive", "#808000") .put("olivedrab", "#6b8e23") .put("orange", "#ffa500") .put("orangered", "#ff4500") .put("orchid", "#da70d6") .put("palegoldenrod", "#eee8aa") .put("palegreen", "#98fb98") .put("paleturquoise", 
"#afeeee") .put("palevioletred", "#db7093") .put("papayawhip", "#ffefd5") .put("peachpuff", "#ffdab9") .put("peru", "#cd853f") .put("pink", "#ffc0cb") .put("plum", "#dda0dd") .put("powderblue", "#b0e0e6") .put("purple", "#800080") .put("red", "#f00") .put("rosybrown", "#bc8f8f") .put("royalblue", "#4169e1") .put("saddlebrown", "#8b4513") .put("salmon", "#fa8072") .put("sandybrown", "#f4a460") .put("seagreen", "#2e8b57") .put("seashell", "#fff5ee") .put("sienna", "#a0522d") .put("silver", "#c0c0c0") .put("skyblue", "#87ceeb") .put("slateblue", "#6a5acd") .put("slategray", "#708090") .put("slategrey", "#708090") .put("snow", "#fffafa") .put("springgreen", "#00ff7f") .put("steelblue", "#4682b4") .put("tan", "#d2b48c") .put("teal", "#008080") .put("thistle", "#d8bfd8") .put("tomato", "#ff6347") .put("turquoise", "#40e0d0") .put("violet", "#ee82ee") .put("wheat", "#f5deb3") .put("white", "#fff") .put("whitesmoke", "#f5f5f5") .put("yellow", "#ff0") .put("yellowgreen", "#9acd32") .build(); } �����������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlEntities.java��������������������0000664�0001750�0001750�00000047064�11654053470�030277� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import com.google.common.collect.ImmutableMap; /** * Utilities for decoding HTML entities, e.g., {@code &amp;}. */ class HtmlEntities { /** * Decodes any HTML entity at the given location. This handles both named and * numeric entities. * * @param html HTML text. * @param offset the position of the sequence to decode. * @param limit the last position in chars that could be part of the sequence * to decode. * @return The offset after the end of the decoded sequence and the decoded * code-point or code-unit packed into a long. * The first 32 bits are the offset, and the second 32 bits are a * code-point or a code-unit. 
*/ public static long decodeEntityAt(String html, int offset, int limit) { char ch = html.charAt(offset); if ('&' != ch) { return ((offset + 1L) << 32) | ch; } int entityLimit = Math.min(limit, offset + 10); int end = -1; int tail = -1; if (entityLimit == limit) { // Assume a broken entity that ends at the end until shown otherwise. end = tail = entityLimit; } entityloop: for (int i = offset + 1; i < entityLimit; ++i) { switch (html.charAt(i)) { case ';': // An unbroken entity. end = i; tail = end + 1; break entityloop; case '#': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case '=': // An equal sign after an entity missing a closing semicolon should // never have the semicolon inserted since that causes trouble with // parameters in partially encoded URLs. return ((offset + 1L) << 32) | '&'; default: // A possible broken entity. 
end = i; tail = i; break entityloop; } } if (end < 0 || offset + 2 >= end) { return ((offset + 1L) << 32) | '&'; } // Now we know where the entity ends, and that there is at least one // character in the entity name char ch1 = html.charAt(offset + 1); char ch2 = html.charAt(offset + 2); int codepoint = -1; if ('#' == ch1) { // numeric entity if ('x' == ch2 || 'X' == ch2) { if (end == offset + 3) { // No digits return ((offset + 1L) << 32) | '&'; } codepoint = 0; // hex literal digloop: for (int i = offset + 3; i < end; ++i) { char digit = html.charAt(i); switch (digit & 0xfff8) { case 0x30: case 0x38: // ASCII 48-57 are '0'-'9' int decDig = digit & 0xf; if (decDig < 10) { codepoint = (codepoint << 4) | decDig; } else { codepoint = -1; break digloop; } break; // ASCII 65-70 and 97-102 are 'A'-'Z' && 'a'-'z' case 0x40: case 0x60: int hexDig = (digit & 0x7); if (hexDig != 0 && hexDig < 7) { codepoint = (codepoint << 4) | (hexDig + 9); } else { codepoint = -1; break digloop; } break; default: codepoint = -1; break digloop; } } if (codepoint > Character.MAX_CODE_POINT) { codepoint = 0xfffd; // Unknown. } } else { codepoint = 0; // decimal literal digloop: for (int i = offset + 2; i < end; ++i) { char digit = html.charAt(i); switch (digit & 0xfff8) { case 0x30: case 0x38: // ASCII 48-57 are '0'-'9' int decDig = digit - '0'; if (decDig < 10) { codepoint = (codepoint * 10) + decDig; } else { codepoint = -1; break digloop; } break; default: codepoint = -1; break digloop; } } if (codepoint > Character.MAX_CODE_POINT) { codepoint = 0xfffd; // Unknown. 
} } } else { Trie t = ENTITY_TRIE; for (int i = offset + 1; i < end; ++i) { char nameChar = html.charAt(i); t = t.lookup(nameChar); if (t == null) { break; } } if (t == null) { t = ENTITY_TRIE; for (int i = offset + 1; i < end; ++i) { char nameChar = html.charAt(i); if ('Z' >= nameChar && nameChar >= 'A') { nameChar |= 32; } t = t.lookup(nameChar); if (t == null) { break; } } } if (t != null && t.isTerminal()) { codepoint = t.getValue(); } } if (codepoint < 0) { return ((offset + 1L) << 32) | '&'; } else { return (((long) tail) << 32) | codepoint; } } /** A possible entity name like "amp" or "gt". */ public static boolean isEntityName(String name) { Trie t = ENTITY_TRIE; int n = name.length(); // Treat AMP the same amp, but not Amp. boolean isUcase = true; for (int i = 0; i < n; ++i) { char ch = name.charAt(i); if (!('A' <= ch && ch <= 'Z')) { isUcase = false; break; } } if (isUcase) { name = Strings.toLowerCase(name); } for (int i = 0; i < n; ++i) { t = t.lookup(name.charAt(i)); if (t == null) { return false; } } return t.isTerminal(); } /** A trie that maps entity names to codepoints. 
*/ public static final Trie ENTITY_TRIE = new Trie( ImmutableMap.<String, Integer>builder() // C0 Controls and Basic Latin .put("quot", Integer.valueOf('"')) .put("amp", Integer.valueOf('&')) .put("lt", Integer.valueOf('<')) .put("gt", Integer.valueOf('>')) // XML 1.0 .put("apos", Integer.valueOf('\'')) // HTML4 entities .put("nbsp", Integer.valueOf('\u00a0')) .put("iexcl", Integer.valueOf('\u00a1')) .put("cent", Integer.valueOf('\u00a2')) .put("pound", Integer.valueOf('\u00a3')) .put("curren", Integer.valueOf('\u00a4')) .put("yen", Integer.valueOf('\u00a5')) .put("brvbar", Integer.valueOf('\u00a6')) .put("sect", Integer.valueOf('\u00a7')) .put("uml", Integer.valueOf('\u00a8')) .put("copy", Integer.valueOf('\u00a9')) .put("ordf", Integer.valueOf('\u00aa')) .put("laquo", Integer.valueOf('\u00ab')) .put("not", Integer.valueOf('\u00ac')) .put("shy", Integer.valueOf('\u00ad')) .put("reg", Integer.valueOf('\u00ae')) .put("macr", Integer.valueOf('\u00af')) .put("deg", Integer.valueOf('\u00b0')) .put("plusmn", Integer.valueOf('\u00b1')) .put("sup2", Integer.valueOf('\u00b2')) .put("sup3", Integer.valueOf('\u00b3')) .put("acute", Integer.valueOf('\u00b4')) .put("micro", Integer.valueOf('\u00b5')) .put("para", Integer.valueOf('\u00b6')) .put("middot", Integer.valueOf('\u00b7')) .put("cedil", Integer.valueOf('\u00b8')) .put("sup1", Integer.valueOf('\u00b9')) .put("ordm", Integer.valueOf('\u00ba')) .put("raquo", Integer.valueOf('\u00bb')) .put("frac14", Integer.valueOf('\u00bc')) .put("frac12", Integer.valueOf('\u00bd')) .put("frac34", Integer.valueOf('\u00be')) .put("iquest", Integer.valueOf('\u00bf')) .put("Agrave", Integer.valueOf('\u00c0')) .put("Aacute", Integer.valueOf('\u00c1')) .put("Acirc", Integer.valueOf('\u00c2')) .put("Atilde", Integer.valueOf('\u00c3')) .put("Auml", Integer.valueOf('\u00c4')) .put("Aring", Integer.valueOf('\u00c5')) .put("AElig", Integer.valueOf('\u00c6')) .put("Ccedil", Integer.valueOf('\u00c7')) .put("Egrave", Integer.valueOf('\u00c8')) 
.put("Eacute", Integer.valueOf('\u00c9')) .put("Ecirc", Integer.valueOf('\u00ca')) .put("Euml", Integer.valueOf('\u00cb')) .put("Igrave", Integer.valueOf('\u00cc')) .put("Iacute", Integer.valueOf('\u00cd')) .put("Icirc", Integer.valueOf('\u00ce')) .put("Iuml", Integer.valueOf('\u00cf')) .put("ETH", Integer.valueOf('\u00d0')) .put("Ntilde", Integer.valueOf('\u00d1')) .put("Ograve", Integer.valueOf('\u00d2')) .put("Oacute", Integer.valueOf('\u00d3')) .put("Ocirc", Integer.valueOf('\u00d4')) .put("Otilde", Integer.valueOf('\u00d5')) .put("Ouml", Integer.valueOf('\u00d6')) .put("times", Integer.valueOf('\u00d7')) .put("Oslash", Integer.valueOf('\u00d8')) .put("Ugrave", Integer.valueOf('\u00d9')) .put("Uacute", Integer.valueOf('\u00da')) .put("Ucirc", Integer.valueOf('\u00db')) .put("Uuml", Integer.valueOf('\u00dc')) .put("Yacute", Integer.valueOf('\u00dd')) .put("THORN", Integer.valueOf('\u00de')) .put("szlig", Integer.valueOf('\u00df')) .put("agrave", Integer.valueOf('\u00e0')) .put("aacute", Integer.valueOf('\u00e1')) .put("acirc", Integer.valueOf('\u00e2')) .put("atilde", Integer.valueOf('\u00e3')) .put("auml", Integer.valueOf('\u00e4')) .put("aring", Integer.valueOf('\u00e5')) .put("aelig", Integer.valueOf('\u00e6')) .put("ccedil", Integer.valueOf('\u00e7')) .put("egrave", Integer.valueOf('\u00e8')) .put("eacute", Integer.valueOf('\u00e9')) .put("ecirc", Integer.valueOf('\u00ea')) .put("euml", Integer.valueOf('\u00eb')) .put("igrave", Integer.valueOf('\u00ec')) .put("iacute", Integer.valueOf('\u00ed')) .put("icirc", Integer.valueOf('\u00ee')) .put("iuml", Integer.valueOf('\u00ef')) .put("eth", Integer.valueOf('\u00f0')) .put("ntilde", Integer.valueOf('\u00f1')) .put("ograve", Integer.valueOf('\u00f2')) .put("oacute", Integer.valueOf('\u00f3')) .put("ocirc", Integer.valueOf('\u00f4')) .put("otilde", Integer.valueOf('\u00f5')) .put("ouml", Integer.valueOf('\u00f6')) .put("divide", Integer.valueOf('\u00f7')) .put("oslash", Integer.valueOf('\u00f8')) .put("ugrave", 
Integer.valueOf('\u00f9')) .put("uacute", Integer.valueOf('\u00fa')) .put("ucirc", Integer.valueOf('\u00fb')) .put("uuml", Integer.valueOf('\u00fc')) .put("yacute", Integer.valueOf('\u00fd')) .put("thorn", Integer.valueOf('\u00fe')) .put("yuml", Integer.valueOf('\u00ff')) // Latin Extended-B .put("fnof", Integer.valueOf('\u0192')) // Greek .put("Alpha", Integer.valueOf('\u0391')) .put("Beta", Integer.valueOf('\u0392')) .put("Gamma", Integer.valueOf('\u0393')) .put("Delta", Integer.valueOf('\u0394')) .put("Epsilon", Integer.valueOf('\u0395')) .put("Zeta", Integer.valueOf('\u0396')) .put("Eta", Integer.valueOf('\u0397')) .put("Theta", Integer.valueOf('\u0398')) .put("Iota", Integer.valueOf('\u0399')) .put("Kappa", Integer.valueOf('\u039a')) .put("Lambda", Integer.valueOf('\u039b')) .put("Mu", Integer.valueOf('\u039c')) .put("Nu", Integer.valueOf('\u039d')) .put("Xi", Integer.valueOf('\u039e')) .put("Omicron", Integer.valueOf('\u039f')) .put("Pi", Integer.valueOf('\u03a0')) .put("Rho", Integer.valueOf('\u03a1')) .put("Sigma", Integer.valueOf('\u03a3')) .put("Tau", Integer.valueOf('\u03a4')) .put("Upsilon", Integer.valueOf('\u03a5')) .put("Phi", Integer.valueOf('\u03a6')) .put("Chi", Integer.valueOf('\u03a7')) .put("Psi", Integer.valueOf('\u03a8')) .put("Omega", Integer.valueOf('\u03a9')) .put("alpha", Integer.valueOf('\u03b1')) .put("beta", Integer.valueOf('\u03b2')) .put("gamma", Integer.valueOf('\u03b3')) .put("delta", Integer.valueOf('\u03b4')) .put("epsilon", Integer.valueOf('\u03b5')) .put("zeta", Integer.valueOf('\u03b6')) .put("eta", Integer.valueOf('\u03b7')) .put("theta", Integer.valueOf('\u03b8')) .put("iota", Integer.valueOf('\u03b9')) .put("kappa", Integer.valueOf('\u03ba')) .put("lambda", Integer.valueOf('\u03bb')) .put("mu", Integer.valueOf('\u03bc')) .put("nu", Integer.valueOf('\u03bd')) .put("xi", Integer.valueOf('\u03be')) .put("omicron", Integer.valueOf('\u03bf')) .put("pi", Integer.valueOf('\u03c0')) .put("rho", Integer.valueOf('\u03c1')) 
.put("sigmaf", Integer.valueOf('\u03c2')) .put("sigma", Integer.valueOf('\u03c3')) .put("tau", Integer.valueOf('\u03c4')) .put("upsilon", Integer.valueOf('\u03c5')) .put("phi", Integer.valueOf('\u03c6')) .put("chi", Integer.valueOf('\u03c7')) .put("psi", Integer.valueOf('\u03c8')) .put("omega", Integer.valueOf('\u03c9')) .put("thetasym", Integer.valueOf('\u03d1')) .put("upsih", Integer.valueOf('\u03d2')) .put("piv", Integer.valueOf('\u03d6')) // General Punctuation .put("bull", Integer.valueOf('\u2022')) .put("hellip", Integer.valueOf('\u2026')) .put("prime", Integer.valueOf('\u2032')) .put("Prime", Integer.valueOf('\u2033')) .put("oline", Integer.valueOf('\u203e')) .put("frasl", Integer.valueOf('\u2044')) // Letterlike Symbols .put("weierp", Integer.valueOf('\u2118')) .put("image", Integer.valueOf('\u2111')) .put("real", Integer.valueOf('\u211c')) .put("trade", Integer.valueOf('\u2122')) .put("alefsym", Integer.valueOf('\u2135')) // Arrows .put("larr", Integer.valueOf('\u2190')) .put("uarr", Integer.valueOf('\u2191')) .put("rarr", Integer.valueOf('\u2192')) .put("darr", Integer.valueOf('\u2193')) .put("harr", Integer.valueOf('\u2194')) .put("crarr", Integer.valueOf('\u21b5')) .put("lArr", Integer.valueOf('\u21d0')) .put("uArr", Integer.valueOf('\u21d1')) .put("rArr", Integer.valueOf('\u21d2')) .put("dArr", Integer.valueOf('\u21d3')) .put("hArr", Integer.valueOf('\u21d4')) // Mathematical Operators .put("forall", Integer.valueOf('\u2200')) .put("part", Integer.valueOf('\u2202')) .put("exist", Integer.valueOf('\u2203')) .put("empty", Integer.valueOf('\u2205')) .put("nabla", Integer.valueOf('\u2207')) .put("isin", Integer.valueOf('\u2208')) .put("notin", Integer.valueOf('\u2209')) .put("ni", Integer.valueOf('\u220b')) .put("prod", Integer.valueOf('\u220f')) .put("sum", Integer.valueOf('\u2211')) .put("minus", Integer.valueOf('\u2212')) .put("lowast", Integer.valueOf('\u2217')) .put("radic", Integer.valueOf('\u221a')) .put("prop", Integer.valueOf('\u221d')) 
.put("infin", Integer.valueOf('\u221e')) .put("ang", Integer.valueOf('\u2220')) .put("and", Integer.valueOf('\u2227')) .put("or", Integer.valueOf('\u2228')) .put("cap", Integer.valueOf('\u2229')) .put("cup", Integer.valueOf('\u222a')) .put("int", Integer.valueOf('\u222b')) .put("there4", Integer.valueOf('\u2234')) .put("sim", Integer.valueOf('\u223c')) .put("cong", Integer.valueOf('\u2245')) .put("asymp", Integer.valueOf('\u2248')) .put("ne", Integer.valueOf('\u2260')) .put("equiv", Integer.valueOf('\u2261')) .put("le", Integer.valueOf('\u2264')) .put("ge", Integer.valueOf('\u2265')) .put("sub", Integer.valueOf('\u2282')) .put("sup", Integer.valueOf('\u2283')) .put("nsub", Integer.valueOf('\u2284')) .put("sube", Integer.valueOf('\u2286')) .put("supe", Integer.valueOf('\u2287')) .put("oplus", Integer.valueOf('\u2295')) .put("otimes", Integer.valueOf('\u2297')) .put("perp", Integer.valueOf('\u22a5')) .put("sdot", Integer.valueOf('\u22c5')) // Miscellaneous Technical .put("lceil", Integer.valueOf('\u2308')) .put("rceil", Integer.valueOf('\u2309')) .put("lfloor", Integer.valueOf('\u230a')) .put("rfloor", Integer.valueOf('\u230b')) .put("lang", Integer.valueOf('\u2329')) .put("rang", Integer.valueOf('\u232a')) // Geometric Shapes .put("loz", Integer.valueOf('\u25ca')) // Miscellaneous Symbols .put("spades", Integer.valueOf('\u2660')) .put("clubs", Integer.valueOf('\u2663')) .put("hearts", Integer.valueOf('\u2665')) .put("diams", Integer.valueOf('\u2666')) // Latin Extended-A .put("OElig", Integer.valueOf('\u0152')) .put("oelig", Integer.valueOf('\u0153')) .put("Scaron", Integer.valueOf('\u0160')) .put("scaron", Integer.valueOf('\u0161')) .put("Yuml", Integer.valueOf('\u0178')) // Spacing Modifier Letters .put("circ", Integer.valueOf('\u02c6')) .put("tilde", Integer.valueOf('\u02dc')) // General Punctuation .put("ensp", Integer.valueOf('\u2002')) .put("emsp", Integer.valueOf('\u2003')) .put("thinsp", Integer.valueOf('\u2009')) .put("zwnj", Integer.valueOf('\u200c')) 
.put("zwj", Integer.valueOf('\u200d')) .put("lrm", Integer.valueOf('\u200e')) .put("rlm", Integer.valueOf('\u200f')) .put("ndash", Integer.valueOf('\u2013')) .put("mdash", Integer.valueOf('\u2014')) .put("lsquo", Integer.valueOf('\u2018')) .put("rsquo", Integer.valueOf('\u2019')) .put("sbquo", Integer.valueOf('\u201a')) .put("ldquo", Integer.valueOf('\u201c')) .put("rdquo", Integer.valueOf('\u201d')) .put("bdquo", Integer.valueOf('\u201e')) .put("dagger", Integer.valueOf('\u2020')) .put("Dagger", Integer.valueOf('\u2021')) .put("permil", Integer.valueOf('\u2030')) .put("lsaquo", Integer.valueOf('\u2039')) .put("rsaquo", Integer.valueOf('\u203a')) .put("euro", Integer.valueOf('\u20ac')) .build()); private HtmlEntities() { /* uninstantiable */ } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/HtmlPolicyBuilder.java���������������0000664�0001750�0001750�00000062403�11654053470�031253� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
package org.owasp.html; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import javax.annotation.Nullable; import javax.annotation.concurrent.NotThreadSafe; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.google.common.collect.Sets; /** * Conveniences for configuring policies for the {@link HtmlSanitizer}. * * <h3>Usage</h3> * <p> * To create a policy, first construct an instance of this class; then call * <code>allow&hellip;</code> methods to turn on tags, attributes, and other * processing modes; and finally call <code>build()</code> or * <code>toFactory()</code>. * </p> * <pre class="prettyprint lang-java"> * // Define the policy. * Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy> policyDefinition * = new HtmlPolicyBuilder() * .allowElements("a", "p") * .allowAttributes("href").onElements("a") * .toFactory(); * * // Sanitize your output. * HtmlSanitizer.sanitize(myHtml. policyDefinition.apply(myHtmlStreamRenderer)); * </pre> * * <h3>Embedded Content</h3> * <p> * Embedded URLs are filtered by * {@link HtmlPolicyBuilder#allowUrlProtocols protocol}. * There is a {@link HtmlPolicyBuilder#allowStandardUrlProtocols canned policy} * so you can easily white-list widely used policies that don't violate the * current pages origin. See "Customization" below for ways to do further * filtering. If you allow links it might be worthwhile to * {@link HtmlPolicyBuilder#requireRelNofollowOnLinks() require} * {@code rel=nofollow}. * </p> * <p> * This class simply throws out all embedded JS. * Use a custom element or attribute policy to allow through * signed or otherwise known-safe code. * Check out the Caja project if you need a way to contain third-party JS. * </p> * <p> * This class does not attempt to faithfully parse and sanitize CSS. 
* It does provide {@link HtmlPolicyBuilder#allowStyling() one} styling option * that allows through a few CSS properties that allow textual styling, but that * disallow image loading, history stealing, layout breaking, code execution, * etc. * </p> * * <h3>Customization</h3> * <p> * You can easily do custom processing on tags and attributes by supplying your * own {@link ElementPolicy element policy} or * {@link AttributePolicy attribute policy} when calling * <code>allow&hellip;</code>. * E.g. to convert headers into {@code <div>}s, you could use an element policy * </p> * <pre class="prettyprint lang-java"> * new HtmlPolicyBuilder * .allowElement( * new ElementPolicy() { * public String apply(String elementName, List<String> attributes) { * attributes.add("class"); * attributes.add("header-" + elementName); * return "div"; * } * }, * "h1", "h2", "h3", "h4", "h5", "h6") * .build(outputChannel) * </pre> * * <h3>Rules of Thumb</h3> * <p> * Throughout this class, several rules hold: * <ul> * <li>Everything is denied by default. There are * <code>disallow&hellip;</code> methods, but those reverse * allows instead of rolling back overly permissive defaults. * <li>The order of allows and disallows does not matter. * Disallows trump allows whether they occur before or after them. * The only method that needs to be called in a particular place is * {@link HtmlPolicyBuilder#build}. * Allows or disallows after {@code build} is called have no * effect on the already built policy. * <li>Element and attribute policies are applied in the following order: * element specific attribute policy, global attribute policy, element * policy. * Element policies come last so they can observe all the post-processed * attributes, and so they can add attributes that are exempt from * attribute policies. * Element specific policies go first, so they can normalize content to * a form that might be acceptable to a more simplistic global policy. 
* </ul> * * <h3>Thread safety and efficiency</h3> * <p> * This class is not thread-safe. The resulting policy will not violate its * security guarantees as a result of race conditions, but is not thread safe * because it maintains state to track whether text inside disallowed elements * should be suppressed. * <p> * The resulting policy can be reused, but if you use the * {@link HtmlPolicyBuilder#toFactory()} method instead of {@link #build}, then * binding policies to output channels is cheap so there's no need. * </p> * * @author Mike Samuel <mikesamuel@gmail.com> */ @TCB @NotThreadSafe public class HtmlPolicyBuilder { /** * The default set of elements that are removed if they have no attributes. * Since {@code <img>} is in this set, by default, a policy will remove * {@code <img src=javascript:alert(1337)>} because its URL is not allowed * and it has no other attributes that would warrant it appearing in the * output. */ public static final ImmutableSet<String> DEFAULT_SKIP_IF_EMPTY = ImmutableSet.of("a", "font", "img", "input", "span"); private final Map<String, ElementPolicy> elPolicies = Maps.newLinkedHashMap(); private final Map<String, Map<String, AttributePolicy>> attrPolicies = Maps.newLinkedHashMap(); private final Map<String, AttributePolicy> globalAttrPolicies = Maps.newLinkedHashMap(); private final Set<String> allowedProtocols = Sets.newLinkedHashSet(); private final Set<String> skipIfEmpty = Sets.newLinkedHashSet( DEFAULT_SKIP_IF_EMPTY); private boolean requireRelNofollowOnLinks, allowStyling; /** * Allows the named elements. */ public HtmlPolicyBuilder allowElements(String... elementNames) { return allowElements(ElementPolicy.IDENTITY_ELEMENT_POLICY, elementNames); } /** * Disallows the named elements. Elements are disallowed by default, so * there is no need to disallow elements, unless you are making an exception * based on an earlier allow. */ public HtmlPolicyBuilder disallowElements(String... 
elementNames) { return allowElements(ElementPolicy.REJECT_ALL_ELEMENT_POLICY, elementNames); } /** * Allow the given elements with the given policy. * * @param policy May remove or add attributes, change the element name, or * deny the element. */ public HtmlPolicyBuilder allowElements( ElementPolicy policy, String... elementNames) { invalidateCompiledState(); for (String elementName : elementNames) { elementName = HtmlLexer.canonicalName(elementName); ElementPolicy newPolicy = ElementPolicy.Util.join( elPolicies.get(elementName), policy); // Don't remove if newPolicy is the always reject policy since we want // that to infect later allowElement calls for this particular element // name. rejects should have higher priority than allows. elPolicies.put(elementName, newPolicy); } return this; } /** * A canned policy that allows a number of common formatting elements. */ public HtmlPolicyBuilder allowCommonInlineFormattingElements() { return allowElements( "b", "i", "font", "s", "u", "o", "sup", "sub", "ins", "del", "strong", "strike", "tt", "code", "big", "small", "br", "span"); } /** * A canned policy that allows a number of common block elements. */ public HtmlPolicyBuilder allowCommonBlockElements() { return allowElements( "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li", "blockquote"); } /** * Assuming the given elements are allowed, allows them to appear without * attributes. * * @see #DEFAULT_SKIP_IF_EMPTY * @see #disallowWithoutAttributes */ public HtmlPolicyBuilder allowWithoutAttributes(String... elementNames) { invalidateCompiledState(); for (String elementName : elementNames) { elementName = HtmlLexer.canonicalName(elementName); skipIfEmpty.remove(elementName); } return this; } /** * Disallows the given elements from appearing without attributes. * * @see #DEFAULT_SKIP_IF_EMPTY * @see #allowWithoutAttributes */ public HtmlPolicyBuilder disallowWithoutAttributes(String... 
elementNames) { invalidateCompiledState(); for (String elementName : elementNames) { elementName = HtmlLexer.canonicalName(elementName); skipIfEmpty.add(elementName); } return this; } /** * Returns an object that lets you associate policies with the given * attributes, and allow them globally or on specific elements. */ public AttributeBuilder allowAttributes(String... attributeNames) { ImmutableList.Builder<String> b = ImmutableList.builder(); for (String attributeName : attributeNames) { b.add(HtmlLexer.canonicalName(attributeName)); } return new AttributeBuilder(b.build()); } /** * Reverse an earlier attribute {@link #allowAttributes allow}. * <p> * For this to have an effect you must call at least one of * {@link AttributeBuilder#globally} and {@link AttributeBuilder#onElements}. * <p> * Attributes are disallowed by default, so there is no need to call this * with a laundry list of attribute/element pairs. */ public AttributeBuilder disallowAttributes(String... attributeNames) { return this.allowAttributes(attributeNames) .matching(AttributePolicy.REJECT_ALL_ATTRIBUTE_POLICY); } private HtmlPolicyBuilder allowAttributesGlobally( AttributePolicy policy, List<String> attributeNames) { invalidateCompiledState(); for (String attributeName : attributeNames) { // We reinterpret the identity policy later via policy joining since its // the default passed from the policy-less method, but we don't do // anything here since we don't know until build() is called whether the // policy author wants to allow certain URL protocols or wants to deal // with styles. 
AttributePolicy oldPolicy = globalAttrPolicies.get(attributeName); globalAttrPolicies.put( attributeName, AttributePolicy.Util.join(oldPolicy, policy)); } return this; } private HtmlPolicyBuilder allowAttributesOnElements( AttributePolicy policy, List<String> attributeNames, List<String> elementNames) { invalidateCompiledState(); for (String elementName : elementNames) { Map<String, AttributePolicy> policies = attrPolicies.get(elementName); if (policies == null) { policies = Maps.newLinkedHashMap(); attrPolicies.put(elementName, policies); } for (String attributeName : attributeNames) { AttributePolicy oldPolicy = policies.get(attributeName); policies.put( attributeName, AttributePolicy.Util.join(oldPolicy, policy)); } } return this; } /** * Adds <a href="http://en.wikipedia.org/wiki/Nofollow"><code>rel=nofollow</code></a> * to links. */ public HtmlPolicyBuilder requireRelNofollowOnLinks() { invalidateCompiledState(); this.requireRelNofollowOnLinks = true; return this; } /** * Adds to the set of protocols that are allowed in URL attributes. * For each URL attribute that is allowed, we further constrain it by * only allowing the value through if it specifies no protocol, or if it * specifies one in the allowedProtocols white-list. * This is done regardless of whether any protocols have been allowed, so * allowing the attribute "href" globally with the identity policy but * not white-listing any protocols, effectively disallows the "href" * attribute globally. * <p> * Do not allow any <code>*script</code> such as <code>javascript</code> * protocols if you might use this policy with untrusted code. */ public HtmlPolicyBuilder allowUrlProtocols(String... protocols) { invalidateCompiledState(); // If there is at least one allowed protocol, then allow URLs and // add a filter that checks href and src values. // Do not allow href and srcs through otherwise, and only allow on images // and links. 
for (String protocol : protocols) { protocol = Strings.toLowerCase(protocol); allowedProtocols.add(protocol); } return this; } /** * Reverses a decision made by {@link #allowUrlProtocols}. */ public HtmlPolicyBuilder disallowUrlProtocols(String... protocols) { invalidateCompiledState(); for (String protocol : protocols) { protocol = Strings.toLowerCase(protocol); allowedProtocols.remove(protocol); } return this; } /** * A canned URL protocol policy that allows <code>http</code>, * <code>https</code>, and <code>mailto</code>. */ public HtmlPolicyBuilder allowStandardUrlProtocols() { return allowUrlProtocols("http", "https", "mailto"); } /** * Convert <code>style="&lt;CSS&gt;"</code> to simple non-JS containing * <code>&lt;font&gt;</code> tags to allow color, font-size, typeface, and * other styling. */ public HtmlPolicyBuilder allowStyling() { invalidateCompiledState(); allowStyling = true; return this; } /** * Names of attributes from HTML 4 whose values are URLs. * Other attributes, e.g. <code>style</code> may contain URLs even though * there values are not URLs. */ private static final Set<String> URL_ATTRIBUTE_NAMES = ImmutableSet.of( "action", "archive", "background", "cite", "classid", "codebase", "data", "dsync", "formaction", "href", "icon", "longdesc", "manifest", "poster", "profile", "src", "usemap"); /** * Produces a policy based on the allow and disallow calls previously made. * * @param out receives calls to open only tags allowed by * previous calls to this object. * Typically a {@link HtmlStreamRenderer}. */ public HtmlSanitizer.Policy build(HtmlStreamEventReceiver out) { return toFactory().apply(out); } /** * Like {@link #build} but can be reused to create many different policies * each backed by a different output channel. */ public PolicyFactory toFactory() { return new PolicyFactory(compilePolicies(), allowStyling); } // Speed up subsequent builds by caching the compiled policies. 
private transient ImmutableMap<String, ElementAndAttributePolicies> compiledPolicies; /** Called by mutators to signal that any compiled policy is out-of-date. */ private void invalidateCompiledState() { compiledPolicies = null; } private ImmutableMap<String, ElementAndAttributePolicies> compilePolicies() { if (compiledPolicies != null) { return compiledPolicies; } // Copy maps before normalizing in case builder is reused. Map<String, ElementPolicy> elPolicies = Maps.newLinkedHashMap(this.elPolicies); Map<String, Map<String, AttributePolicy>> attrPolicies = Maps.newLinkedHashMap(this.attrPolicies); for (Map.Entry<String, Map<String, AttributePolicy>> e : attrPolicies.entrySet()) { e.setValue(Maps.newLinkedHashMap(e.getValue())); } Map<String, AttributePolicy> globalAttrPolicies = Maps.newLinkedHashMap(this.globalAttrPolicies); Set<String> allowedProtocols = ImmutableSet.copyOf(this.allowedProtocols); // Implement requireRelNofollowOnLinks if (requireRelNofollowOnLinks) { elPolicies.put( "a", ElementPolicy.Util.join( elPolicies.get("a"), new ElementPolicy() { public String apply(String elementName, List<String> attrs) { for (int i = 0, n = attrs.size(); i < n; i += 2) { if ("href".equals(attrs.get(i))) { attrs.add("rel"); attrs.add("nofollow"); break; } } return elementName; } })); } // Implement protocol policies. // For each URL attribute that is allowed, we further constrain it by // only allowing the value through if it specifies no protocol, or if it // specifies one in the allowedProtocols white-list. // This is done regardless of whether any protocols have been allowed, so // allowing the attribute "href" globally with the identity policy but // not white-listing any protocols, effectively disallows the "href" // attribute globally. 
{ AttributePolicy urlAttributePolicy; if (allowedProtocols.size() == 3 && allowedProtocols.contains("mailto") && allowedProtocols.contains("http") && allowedProtocols.contains("https")) { urlAttributePolicy = StandardUrlAttributePolicy.INSTANCE; } else { urlAttributePolicy = new FilterUrlByProtocolAttributePolicy( allowedProtocols); } Set<String> toGuard = Sets.newLinkedHashSet(URL_ATTRIBUTE_NAMES); for (String urlAttributeName : URL_ATTRIBUTE_NAMES) { if (globalAttrPolicies.containsKey(urlAttributeName)) { toGuard.remove(urlAttributeName); globalAttrPolicies.put(urlAttributeName, AttributePolicy.Util.join( urlAttributePolicy, globalAttrPolicies.get(urlAttributeName))); } } // Implement guards not implemented on global policies in the per-element // policy maps. for (Map.Entry<String, Map<String, AttributePolicy>> e : attrPolicies.entrySet()) { Map<String, AttributePolicy> policies = e.getValue(); for (String urlAttributeName : toGuard) { if (policies.containsKey(urlAttributeName)) { policies.put(urlAttributeName, AttributePolicy.Util.join( urlAttributePolicy, policies.get(urlAttributeName))); } } } } ImmutableMap.Builder<String, ElementAndAttributePolicies> policiesBuilder = ImmutableMap.builder(); for (Map.Entry<String, ElementPolicy> e : elPolicies.entrySet()) { String elementName = e.getKey(); ElementPolicy elPolicy = e.getValue(); if (ElementPolicy.REJECT_ALL_ELEMENT_POLICY.equals(elPolicy)) { continue; } Map<String, AttributePolicy> elAttrPolicies = attrPolicies.get(elementName); if (elAttrPolicies == null) { elAttrPolicies = ImmutableMap.of(); } ImmutableMap.Builder<String, AttributePolicy> attrs = ImmutableMap.builder(); for (Map.Entry<String, AttributePolicy> ape : elAttrPolicies.entrySet()) { String attributeName = ape.getKey(); if (globalAttrPolicies.containsKey(attributeName)) { continue; } AttributePolicy policy = ape.getValue(); if (!AttributePolicy.REJECT_ALL_ATTRIBUTE_POLICY.equals(policy)) { attrs.put(attributeName, policy); } } for 
(Map.Entry<String, AttributePolicy> ape : globalAttrPolicies.entrySet()) { String attributeName = ape.getKey(); AttributePolicy policy = AttributePolicy.Util.join( elAttrPolicies.get(attributeName), ape.getValue()); if (!AttributePolicy.REJECT_ALL_ATTRIBUTE_POLICY.equals(policy)) { attrs.put(attributeName, policy); } } policiesBuilder.put( elementName, new ElementAndAttributePolicies( elementName, elPolicy, attrs.build(), skipIfEmpty.contains(elementName))); } return compiledPolicies = policiesBuilder.build(); } /** * Builds the relationship between attributes, the values that they may have, * and the elements on which they may appear. * * @author Mike Samuel */ public final class AttributeBuilder { private final List<String> attributeNames; private AttributePolicy policy = AttributePolicy.IDENTITY_ATTRIBUTE_POLICY; AttributeBuilder(List<? extends String> attributeNames) { this.attributeNames = ImmutableList.copyOf(attributeNames); } /** * Filters and/or transforms the attribute values * allowed by later {@code allow*} calls. * Multiple calls to {@code matching} are combined so that the policies * receive the value in order, each seeing the value after any * transformation by a previous policy. */ public AttributeBuilder matching(AttributePolicy policy) { this.policy = AttributePolicy.Util.join(this.policy, policy); return this; } /** * Restrict the values allowed by later {@code allow*} calls to those * matching the pattern. * Multiple calls to {@code matching} are combined to restrict to the * intersection of possible matched values. */ public AttributeBuilder matching(final Pattern pattern) { return matching(new AttributePolicy() { public @Nullable String apply( String elementName, String attributeName, String value) { return pattern.matcher(value).matches() ? value : null; } }); } /** * Restrict the values allowed by later {@code allow*} calls to those * matching the given predicate. 
* Multiple calls to {@code matching} are combined to restrict to the * intersection of possible matched values. */ public AttributeBuilder matching( final Predicate<? super String> filter) { return matching(new AttributePolicy() { public @Nullable String apply( String elementName, String attributeName, String value) { return filter.apply(value) ? value : null; } }); } /** * Restrict the values allowed by later {@code allow*} calls to those * supplied. * Multiple calls to {@code matching} are combined to restrict to the * intersection of possible matched values. */ public AttributeBuilder matching( boolean ignoreCase, String... allowedValues) { return matching(ignoreCase, ImmutableSet.copyOf(allowedValues)); } /** * Restrict the values allowed by later {@code allow*} calls to those * supplied. * Multiple calls to {@code matching} are combined to restrict to the * intersection of possible matched values. */ public AttributeBuilder matching( final boolean ignoreCase, Set<? extends String> allowedValues) { final ImmutableSet<String> allowed = ImmutableSet.copyOf(allowedValues); return matching(new AttributePolicy() { public @Nullable String apply( String elementName, String attributeName, String value) { if (ignoreCase) { value = Strings.toLowerCase(value); } return allowed.contains(value) ? value : null; } }); } /** * Allows the given attributes on any elements but filters the * attributes' values based on previous calls to {@code matching(...)}. * Global attribute policies are applied after element specific policies. * Be careful of using this with attributes like <code>type</code> which * have different meanings on different attributes. 
* Also be careful of allowing globally attributes like <code>href</code> * which can have more far-reaching effects on tags like * <code>&lt;base&gt;</code> and <code>&lt;link&gt;</code> than on * <code>&lt;a&gt;</code> because in the former, they have an effect without * user interaction and can change the behavior of the current page. */ public HtmlPolicyBuilder globally() { return HtmlPolicyBuilder.this.allowAttributesGlobally( policy, attributeNames); } /** * Allows the named attributes on the given elements but filters the * attributes' values based on previous calls to {@code matching(...)}. */ public HtmlPolicyBuilder onElements(String... elementNames) { ImmutableList.Builder<String> b = ImmutableList.builder(); for (String elementName : elementNames) { b.add(HtmlLexer.canonicalName(elementName)); } return HtmlPolicyBuilder.this.allowAttributesOnElements( policy, attributeNames, b.build()); } } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000162�00000000000�011564� L����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/ElementAndAttributePolicyBasedSanitizerPolicy.java���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������owasp-java-html-sanitizer-0.1+r88/src/main/java/org/owasp/html/ElementAndAttributePolicyBasedSanitiz0000664�0001750�0001750�00000014534�11654053470�034323� 0����������������������������������������������������������������������������������������������������ustar �jamespage�����������������������jamespage��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������// Copyright (c) 2011, Mike Samuel // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. 
// Neither the name of the OWASP nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. package org.owasp.html; import java.util.List; import java.util.ListIterator; import javax.annotation.Nullable; import javax.annotation.concurrent.NotThreadSafe; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; /** * A sanitizer policy that applies element and attribute policies to tags. */ @TCB @NotThreadSafe class ElementAndAttributePolicyBasedSanitizerPolicy implements HtmlSanitizer.Policy { final ImmutableMap<String, ElementAndAttributePolicies> elAndAttrPolicies; private final HtmlStreamEventReceiver out; /** * True to skip textual content. Used to ignore the content of embedded CDATA * content that is not meant to be human-readable. */ transient boolean skipText = true; /** * Alternating input names and adjusted names of elements opened by the * caller. 
*/ private final List<String> openElementStack = Lists.newArrayList(); ElementAndAttributePolicyBasedSanitizerPolicy( HtmlStreamEventReceiver out, ImmutableMap<String, ElementAndAttributePolicies> elAndAttrPolicies) { this.out = out; this.elAndAttrPolicies = elAndAttrPolicies; } static final ImmutableSet<String> SKIPPABLE_ELEMENT_CONTENT = ImmutableSet.of( "script", "style", "noscript", "nostyle", "noembed", "noframes", "iframe", "object", "frame", "frameset", "title"); public void openDocument() { skipText = false; openElementStack.clear(); out.openDocument(); } public void closeDocument() { for (int i = openElementStack.size() - 1; i >= 0; i -= 2) { String tagNameToClose = openElementStack.get(i); if (tagNameToClose != null) { out.closeTag(tagNameToClose); } } openElementStack.clear(); skipText = true; out.closeDocument(); } public void text(String textChunk) { if (!skipText) { out.text(textChunk); } } public void openTag(String elementName, List<String> attrs) { // StylingPolicy repeats some of this code because it is more complicated // to refactor it into multiple method bodies, so if you change this, // check the override of it in that class. 
ElementAndAttributePolicies policies = elAndAttrPolicies.get(elementName); String adjustedElementName = applyPolicies(elementName, attrs, policies); if (adjustedElementName != null && !(attrs.isEmpty() && policies.skipIfEmpty)) { skipText = false; writeOpenTag(policies, adjustedElementName, attrs); return; } deferOpenTag(elementName); } final @Nullable String applyPolicies( String elementName, List<String> attrs, ElementAndAttributePolicies policies) { String adjustedElementName; if (policies != null) { for (ListIterator<String> attrsIt = attrs.listIterator(); attrsIt.hasNext();) { String name = attrsIt.next(); AttributePolicy attrPolicy = policies.attrPolicies.get(name); if (attrPolicy == null) { attrsIt.remove(); attrsIt.next(); attrsIt.remove(); } else { String value = attrsIt.next(); String adjustedValue = attrPolicy.apply(elementName, name, value); if (adjustedValue == null) { attrsIt.remove(); attrsIt.previous(); attrsIt.remove(); } else { attrsIt.set(adjustedValue); } } } adjustedElementName = policies.elPolicy.apply(elementName, attrs); } else { adjustedElementName = null; } return adjustedElementName; } public void closeTag(String elementName) { skipText = false; int n = openElementStack.size(); for (int i = n; i > 0;) { i -= 2; String openElementName = openElementStack.get(i); if (elementName.equals(openElementName)) { for (int j = n - 1; j > i; j -= 2) { String tagNameToClose = openElementStack.get(j); if (tagNameToClose != null) { out.closeTag(tagNameToClose); } } openElementStack.subList(i, n).clear(); return; } } } void writeOpenTag( ElementAndAttributePolicies policies, String adjustedElementName, List<String> attrs) { if (!policies.isVoid) { openElementStack.add(policies.elementName); openElementStack.add(adjustedElementName); } out.openTag(adjustedElementName, attrs); } void deferOpenTag(String elementName) { if (HtmlTextEscapingMode.isVoidElement(elementName)) { openElementStack.add(elementName); openElementStack.add(null); } skipText = 
SKIPPABLE_ELEMENT_CONTENT.contains(elementName); } void synthesizeOpenTag(String adjustedElementName, List<String> attrs) { openElementStack.add(null); openElementStack.add(adjustedElementName); out.openTag(adjustedElementName, attrs); } }��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������