package/LICENSE000644 0000002322 3560116604 010264 0ustar00000000 000000 Copyright 2019-present is held by the authors of the safe-regex module. This software is released under the MIT license: Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Original author: James Halliday @substack Maintainer: James C. (Jamie) Davis @davisjam package/lib/analyzer-family.js000644 0000000170 3560116604 013466 0ustar00000000 000000 // Load the analyzers const heuristicAnalyzer = require("./heuristic-analyzer"); module.exports = [heuristicAnalyzer]; package/lib/analyzer.js000644 0000003160 3560116604 012211 0ustar00000000 000000 // Generic options class AnalyzerOptions { constructor(heuristic_replimit) { this.heuristic_replimit = heuristic_replimit; } } class AttackString { constructor(prefixAndPumpList, suffix) { this.prefixAndPumpList = prefixAndPumpList; this.suffix = suffix; } } // Abstract class class Analyzer { constructor(analyzerOptions) { this.options = analyzerOptions; } // Subclasser must implement // Return boolean isVulnerable(regExp) { return false; } // Subclass must implement // Returns an AttackString or null genAttackString(regExp) { return null; } } module.exports = function(re, replimit) { // Build an AST let myRegExp = null; let ast = null; try { // Construct a RegExp object if (re instanceof RegExp) { myRegExp = re; } else if (typeof re === "string") { myRegExp = new RegExp(re); } else { myRegExp = new RegExp(String(re)); } // Build an AST ast = regexpTree.parse(myRegExp); } catch (err) { // Invalid or unparseable input return false; } let currentStarHeight = 0; let maxObservedStarHeight = 0; let repetitionCount = 0; regexpTree.traverse(ast, { Repetition: { pre({ node }) { repetitionCount++; currentStarHeight++; if (maxObservedStarHeight < currentStarHeight) { maxObservedStarHeight = currentStarHeight; } }, post({ node }) { currentStarHeight--; } } }); return maxObservedStarHeight <= 1 && repetitionCount <= replimit; }; module.exports = { "AnalyzerOptions": AnalyzerOptions, "Analyzer": Analyzer, }; package/bin/cli.js000755 0000001673 3560116604 011147 0ustar00000000 000000 #!/usr/bin/env node // CLI for querying safe-regex for safety analysis // Input: JSON-formatted file with an object with key 'pattern' // Output: STDOUT: JSON-formatted object with new key 'isSafe' and value 0 or 1 // STDERR: Progress updates var safe = require('../'), fs = require('fs'); if (process.argv.length != 3) { console.error(`Usage: ${process.argv[1]} pattern.json`); process.exit(1); } const file = process.argv[2]; // Get pattern const cont = fs.readFileSync(file, 'utf-8'); let pattern = JSON.parse(cont); // Can analyze? Is safe? let canAnalyze = 0; let isSafe = 0; try { isSafe = safe(pattern.pattern) ? 1 : 0; canAnalyze = 1; } catch (e) { canAnalyze = 0; isSafe = 'unknown'; } pattern.canAnalyze = canAnalyze; pattern.isSafe = isSafe; // Emit. console.error(`Pattern /${pattern.pattern}/: canAnalyze ${pattern.canAnalyze} isSafe ${pattern.isSafe}`); console.log(JSON.stringify(pattern)); process.exit(0); package/lib/heuristic-analyzer.js000644 0000002741 3560116604 014212 0ustar00000000 000000 // Exports an Analyzer subclass const regexpTree = require("regexp-tree"); const analyzer = require("./analyzer"); class HeuristicAnalyzer extends analyzer.Analyzer { constructor(analyzerOptions) { super(analyzerOptions); } isVulnerable(regExp) { // Heuristic #1: Star height > 1 const starHeight = this._measureStarHeight(regExp); if (starHeight > 1) { return true; } // Heuristic #2: # repetitions > limit // TODO This is a poor heuristic const nRepetitions = this._measureRepetitions(regExp); if (nRepetitions > this.options.heuristic_replimit) { return true; } return false; } genAttackString(regExp) { return null; } _measureStarHeight(regExp) { let currentStarHeight = 0; let maxObservedStarHeight = 0; const ast = regexpTree.parse(regExp); regexpTree.traverse(ast, { Repetition: { pre({ node }) { currentStarHeight++; if (maxObservedStarHeight < currentStarHeight) { maxObservedStarHeight = currentStarHeight; } }, post({ node }) { currentStarHeight--; } } }); return maxObservedStarHeight; } _measureRepetitions(regExp) { let nRepetitions = 0; const ast = regexpTree.parse(regExp); regexpTree.traverse(ast, { Repetition: { pre({ node }) { nRepetitions++; } } }); return nRepetitions; } } module.exports = HeuristicAnalyzer; package/index.js000644 0000003370 3560116604 010730 0ustar00000000 000000 const analyzer = require('./lib/analyzer'); const analyzerFamily = require('./lib/analyzer-family'); const DEFAULT_SAFE_REP_LIMIT = 25; const RET_IS_SAFE = true; const RET_IS_VULNERABLE = false; class Args { constructor(regExp, analyzerOptions) { this.regExp = regExp; this.analyzerOptions = analyzerOptions; } } function safeRegex(re, opts) { try { const args = buildArgs(re, opts); const analyzerResponses = askAnalyzersIfVulnerable(args); // Did any analyzer say true? if (analyzerResponses.find((isVulnerable) => isVulnerable)) { return RET_IS_VULNERABLE; } else { return RET_IS_SAFE; } } catch (err) { // Invalid or unparseable input return false; } } function buildArgs(re, opts) { // Build AnalyzerOptions if (!opts) opts = {}; const heuristic_replimit = opts.limit === undefined ? DEFAULT_SAFE_REP_LIMIT : opts.limit; const analyzerOptions = new analyzer.AnalyzerOptions(heuristic_replimit); // Build RegExp let regExp = null; // Construct a RegExp object if (re instanceof RegExp) { regExp = re; } else if (typeof re === 'string') { regExp = new RegExp(re); } else { regExp = new RegExp(String(re)); } return new Args(regExp, analyzerOptions); } function askAnalyzersIfVulnerable(args) { let analyzerSaysVulnerable = []; // Query the Analyzers let Analyzer; for (Analyzer of analyzerFamily) { try { const analyzer = new Analyzer(args.analyzerOptions); analyzerSaysVulnerable.push(analyzer.isVulnerable(args.regExp)); } catch (err) { /* istanbul ignore next */ // No need to worry about code coverage here. analyzerSaysVulnerable.push(false); } } return analyzerSaysVulnerable; } // Export module.exports = safeRegex;package/test/regex.spec.js000644 0000007376 3560116604 012655 0ustar00000000 000000 const safeRegex = require("../"); const REPETITION_LIMIT = 25; const REPETITION_TOO_MUCH = REPETITION_LIMIT + 1; // TODO Named character classes test("linear-time regexes are safe", () => { const linTime = [ // regular RE's /a/, /a*/, /a+/, /a?/, /a{3,5}/, /a|b/, /(ab)/, /(ab)\1/, /\bOakland\b/, /(a+)|(b+)/, // RE's in a string "^foo/bar", // non-RE, non-string 1, ]; linTime.forEach(re => { expect(safeRegex(re)).toBe(true); }); }); test("linear-time regexes are safe, under varying repetition limits", () => { const re1 = RegExp("a?".repeat(REPETITION_LIMIT) + "a".repeat(REPETITION_LIMIT)); expect(safeRegex(re1)).toBe(true); const LOW_REPETITION_LIMIT = 3; const re2 = RegExp(Array(LOW_REPETITION_LIMIT).join("a?")); expect(safeRegex(re2, { limit: LOW_REPETITION_LIMIT })).toBe(true); }); test("poly-time regexes are safe (at least according to our heuristics)", () => { const polyTime = [ /^a+a+$/, // QOA /^a+aa+$/, // QOA with obvious intermediate run /^a+aaaa+$/, // QOA with obvious intermediate run /^a+[a-z]a+$/, // QOA with obvious intermediate run /^a+\wa+$/, // QOA with intermediate character class /^a+(\w|\d)a+$/, // QOA with valid path through /^a+b?a+$/, // QOA with valid path through /^a+(cde)*a+$/, // QOA with valid path through /^.*a*$/, // QOA by subset /^\w*\d*$/, // QOA by intersection /^\S+@\S+\.\S+$/, // Example from Django /a+$/, // QOA under partial-match /abc.*$/, // QOA under partial-match // TODO It would be nice to have one of the regexes that are poly-time even when they match, due to non-greedy quantifiers (p-NFA) ]; polyTime.forEach(re => { expect(safeRegex(re)).toBe(true); }); }); test("exp-time regexes due to star height are unsafe", () => { const expTime = [ // Straightforward star height /(a*)*$/, /(a?)*$/, /(a*)?$/, /(a*)+$/, /(a+)*$/, /(\wa+)*$/, // Prefix /(\..*)*$/, // Suffix // Branching and nesting. /(a*|b)+$/, /(a|b*)+$/, /(((b*)))+$/, /(((b*))+)$/, /(((b*)+))$/, /(((b)*)+)$/, /(((b)*))+$/, // Misc. more complex cases /^(a?){25}(a){25}$/, /(x+x+)+y/, /foo|(x+x+)+y/, /(a+){10}y/, /(a+){2}y/, /(.*){1,32000}[bc]/, // RE's in a string "(a+)+", ]; expTime.forEach(re => { expect(safeRegex(re)).toBe(false); }); }); test("linear-time regexes with star height > 1", () => { // TODO These are false positives, Fix once we improve analysis const linTime = [ /(ab*)+$/, /(b*a)+$/, ]; linTime.forEach(re => { expect(safeRegex(re)).toBe(false); }); }); test("exp-time regexes due to disjunction are safe (according to current heuristics)", () => { // TODO These are false negatives. Fix once we improve analysis const expTime = [ /(a|a)*$/, // QOD: obvious /(a|\w)*$/, // QOD due to overlap /([abc]|b)*$/, // QOD due to overlap /(\w\w\w|bab)*$/, // QOD due to overlap, with multi-step internal paths ]; expTime.forEach(re => { expect(safeRegex(re)).toBe(true); }); }); test("regex that exceeds repetition limit is unsafe", () => { const re1 = RegExp("a?".repeat(REPETITION_TOO_MUCH) + "a".repeat(REPETITION_TOO_MUCH)); expect(safeRegex(re1)).toBe(false); const LOW_REPETITION_LIMIT = 3; const re2 = RegExp("a?".repeat(LOW_REPETITION_LIMIT + 1)); expect(safeRegex(re2, { limit: LOW_REPETITION_LIMIT })).toBe(false); }); test("invalid regexes default to unsafe", () => { const invalid = [ "(a+", "[a-z", "*Oakland*", "hey(yoo) )", "abcde(?>hellow)", "[abc", ]; invalid.forEach(re => { expect(safeRegex(re)).toBe(false); }); });package/example/safe.js000644 0000000142 3560116604 012164 0ustar00000000 000000 var safe = require('../'); var regex = process.argv.slice(2).join(' '); console.log(safe(regex)); package/package.json000644 0000002153 3560116604 011547 0ustar00000000 000000 { "name": "safe-regex", "version": "2.1.1", "description": "detect possibly catastrophic, exponential-time regular expressions", "main": "index.js", "dependencies": { "regexp-tree": "~0.1.1" }, "devDependencies": { "jest": "^24.9.0" }, "scripts": { "test": "jest" }, "jest": { "moduleFileExtensions": [ "js" ], "testRegex": "test.*\\.spec\\.js$", "collectCoverage": true, "coverageReporters": [ "text-summary", "html", "lcov" ], "collectCoverageFrom": [ "*.js" ], "coverageThreshold": { "global": { "statements": 100, "branches": 100, "functions": 100, "lines": 100 } } }, "repository": { "type": "git", "url": "git://github.com/davisjam/safe-regex.git" }, "homepage": "https://github.com/davisjam/safe-regex", "keywords": [ "catastrophic", "exponential", "regex", "safe", "sandbox" ], "author": { "name": "James C. (Jamie) Davis", "email": "davisjam@vt.edu", "url": "http://people.cs.vt.edu/~davisjam" }, "license": "MIT" } package/CHANGELOG.md000644 0000000512 3560116604 011067 0ustar00000000 000000 # v2 ## v2.0 ## v2.0.0 1. Update README. 2. Switch AST library from ret to regexp-tree. 3. Fix incorrect handling of nested quantifiers in disjunctions. 4. Enhance test suite. Contributors: - [davisjam](https://github.com/davisjam) # v1 This is the historic release. Contributors: - [substack](https://github.com/substack) package/README.md000644 0000007112 3560116604 010540 0ustar00000000 000000 # safe-regex Detect potentially [catastrophic](http://regular-expressions.mobi/catastrophic.html) [exponential-time](http://perlgeek.de/blog-en/perl-tips/in-search-of-an-exponetial-regexp.html) regular expressions by limiting the [star height](https://en.wikipedia.org/wiki/Star_height) to 1. WARNING: This module has both false positives and false negatives. Use [vuln-regex-detector](https://github.com/davisjam/vuln-regex-detector) for improved accuracy. [![Build Status](https://travis-ci.org/davisjam/safe-regex.svg?branch=master)](https://travis-ci.org/davisjam/safe-regex) ## Example Suppose you have a script named `safe.js`: ``` js var safe = require('safe-regex'); var regex = process.argv.slice(2).join(' '); console.log(safe(regex)); ``` This is its behavior: ``` $ node safe.js '(x+x+)+y' false $ node safe.js '(beep|boop)*' true $ node safe.js '(a+){10}' false $ node safe.js '\blocation\s*:[^:\n]+\b(Oakland|San Francisco)\b' true ``` ## Methods ``` js const safe = require('safe-regex') ``` ### const ok = safe(re, opts={}) Return a boolean `ok` whether or not the regex `re` is safe and not possibly catastrophic. `re` can be a `RegExp` object or just a string. If the `re` is a string and is an invalid regex, returns `false`. * `opts.limit` - maximum number of allowed repetitions in the entire regex. Default: `25`. ## Install With [npm](https://npmjs.org) do: ``` npm install safe-regex ``` ## Resources ### What should I do if my project has a super-linear regex? 1. Confirm that it is *reachable* by untrusted input. 2. If it is, you can consider whether you can prevent worst-case behavior by trimming the input, revising the regex, or replacing the regex with another algorithm like string functions. For examples, see Table 5 in [this article](http://people.cs.vt.edu/davisjam/downloads/publications/DavisCoghlanServantLee-EcosystemREDOS-ESECFSE18.pdf). 3. If none of those solutions looks feasible, you might also consider changing regex engines. The [RE2 bindings](https://www.npmjs.com/package/re2) might work, though test carefully to confirm there are no [semantic portability problems](https://medium.com/@davisjam/why-arent-regexes-a-lingua-franca-esecfse19-a36348df3a2?source=friends_link&sk=d21be7f8f723e2080dc993385c6973d1). ### Further reading The following documents may be edifying: - [Research brief on the extent of super-linear regexes in practice](https://medium.com/@davisjam/introduction-987fdc4c7b0?source=friends_link&sk=ceefa4a4ca9617e08ab782c3b1580aea) - [Research brief on the variability of super-linear regex behavior across programming languages](https://medium.com/@davisjam/why-arent-regexes-a-lingua-franca-esecfse19-a36348df3a2?source=friends_link&sk=d21be7f8f723e2080dc993385c6973d1) - [Comparing regex matching algorithms](https://swtch.com/~rsc/regexp/regexp1.html) ## Project policies ### Versioning This project follows [Semantic Versioning 2.0 (semver)](https://semver.org/). Here are the project-specific meanings of MAJOR, MINOR, and PATCH updates: - MAJOR: "Incompatible" API changes were introduced. There are two types in this module: - Changes that modify the interface - Changes that cause any regexes to be marked as unsafe that were formerly marked as safe - MINOR: Functionality was added in a backwards-compatible manner. There are two types in this module: - Refactoring the analyses but not changing their results - Modifying the analyses to reduce false positives, without affecting negatives (false or true) - PATCH: I don't anticipate using PATCH for this module ### License [MIT](https://github.com/davisjam/safe-regex/blob/master/LICENSE)package/.travis.yml000644 0000000230 3560116604 011364 0ustar00000000 000000 sudo: required language: node_js node_js: - "10" - "12" install: - npm install script: - npm test cache: directories: - node_modules