pax_global_header00006660000000000000000000000064130422060110014477gustar00rootroot0000000000000052 comment=d315b61cf6727664f310fa87b3197e9faf2a8513 xurls-1.1.0/000077500000000000000000000000001304220601100126535ustar00rootroot00000000000000xurls-1.1.0/.gitignore000066400000000000000000000001041304220601100146360ustar00rootroot00000000000000cmd/xurls/xurls generate/tldsgen/tldsgen generate/regexgen/regexgen xurls-1.1.0/.travis.yml000066400000000000000000000000341304220601100147610ustar00rootroot00000000000000language: go go: - 1.7.x xurls-1.1.0/LICENSE000066400000000000000000000027201304220601100136610ustar00rootroot00000000000000Copyright (c) 2015, Daniel Martí. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xurls-1.1.0/README.md000066400000000000000000000022651304220601100141370ustar00rootroot00000000000000# xurls [![GoDoc](https://godoc.org/github.com/mvdan/xurls?status.svg)](https://godoc.org/github.com/mvdan/xurls) [![Travis](https://travis-ci.org/mvdan/xurls.svg?branch=master)](https://travis-ci.org/mvdan/xurls) Extract urls from text using regular expressions. go get -u github.com/mvdan/xurls ```go import "github.com/mvdan/xurls" func main() { xurls.Relaxed.FindString("Do gophers live in golang.org?") // "golang.org" xurls.Strict.FindAllString("foo.com is http://foo.com/.", -1) // []string{"http://foo.com/"} } ``` `Relaxed` is around five times slower than `Strict` since it does more work to find the URLs without relying on the scheme: ``` BenchmarkStrictEmpty-4 1000000 1885 ns/op BenchmarkStrictSingle-4 200000 8356 ns/op BenchmarkStrictMany-4 100000 22547 ns/op BenchmarkRelaxedEmpty-4 200000 7284 ns/op BenchmarkRelaxedSingle-4 30000 58557 ns/op BenchmarkRelaxedMany-4 10000 130251 ns/op ``` #### cmd/xurls go get -u github.com/mvdan/xurls/cmd/xurls ```shell $ echo "Do gophers live in http://golang.org?" | xurls http://golang.org ``` xurls-1.1.0/cmd/000077500000000000000000000000001304220601100134165ustar00rootroot00000000000000xurls-1.1.0/cmd/xurls/000077500000000000000000000000001304220601100145735ustar00rootroot00000000000000xurls-1.1.0/cmd/xurls/main.go000066400000000000000000000032071304220601100160500ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package main import ( "bufio" "flag" "fmt" "os" "regexp" "github.com/mvdan/xurls" ) var ( matching = flag.String("m", "", "") relaxed = flag.Bool("r", false, "") ) func init() { flag.Usage = func() { p := func(format string, a ...interface{}) { fmt.Fprintf(os.Stderr, format, a...) } p("Usage: xurls [-h] [files]\n\n") p("If no files are given, it reads from standard input.\n\n") p(" -m only match urls whose scheme matches a regexp\n") p(" example: 'https?://|mailto:'\n") p(" -r also match urls without a scheme (relaxed)\n") } } func scanPath(re *regexp.Regexp, path string) error { r := os.Stdin if path != "-" { f, err := os.Open(path) if err != nil { return err } defer f.Close() r = f } scanner := bufio.NewScanner(r) scanner.Split(bufio.ScanWords) for scanner.Scan() { word := scanner.Text() for _, match := range re.FindAllString(word, -1) { fmt.Println(match) } } return scanner.Err() } func main() { flag.Parse() if *relaxed && *matching != "" { errExit(fmt.Errorf("-r and -m at the same time don't make much sense")) } re := xurls.Strict if *relaxed { re = xurls.Relaxed } else if *matching != "" { var err error if re, err = xurls.StrictMatchingScheme(*matching); err != nil { errExit(err) } } args := flag.Args() if len(args) == 0 { args = []string{"-"} } for _, path := range args { if err := scanPath(re, path); err != nil { errExit(err) } } } func errExit(err error) { fmt.Fprintln(os.Stderr, err) os.Exit(1) } xurls-1.1.0/example_test.go000066400000000000000000000006221304220601100156740ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package xurls_test import ( "fmt" "github.com/mvdan/xurls" ) func Example() { fmt.Println(xurls.Relaxed.FindString("Do gophers live in http://golang.org?")) fmt.Println(xurls.Relaxed.FindAllString("foo.com is http://foo.com/.", -1)) // Output: // http://golang.org // [foo.com http://foo.com/] } xurls-1.1.0/generate/000077500000000000000000000000001304220601100144455ustar00rootroot00000000000000xurls-1.1.0/generate/regexgen/000077500000000000000000000000001304220601100162515ustar00rootroot00000000000000xurls-1.1.0/generate/regexgen/main.go000066400000000000000000000026621304220601100175320ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package main import ( "log" "os" "sort" "strings" "text/template" "golang.org/x/net/idna" "github.com/mvdan/xurls" ) const path = "regex.go" var regexTmpl = template.Must(template.New("regex").Parse(`// Generated by regexgen package xurls const ( {{- range $key, $value := . }} {{$key}} = ` + "`" + `{{$value}}` + "`" + ` {{- end}} ) `)) func writeRegex(tlds []string) error { allTldsSet := make(map[string]struct{}) add := func(tld string) { if _, e := allTldsSet[tld]; e { log.Fatalf("Duplicate TLD: %s", tld) } allTldsSet[tld] = struct{}{} } for _, tldlist := range [...][]string{tlds, xurls.PseudoTLDs} { for _, tld := range tldlist { add(tld) asciiTld, err := idna.ToASCII(tld) if err != nil { return err } if asciiTld != tld { add(asciiTld) } } } var allTlds []string for tld := range allTldsSet { allTlds = append(allTlds, tld) } sort.Strings(allTlds) f, err := os.Create(path) if err != nil { return err } defer f.Close() return regexTmpl.Execute(f, map[string]string{ "gtld ": `(?i)(` + strings.Join(allTlds, `|`) + `)(?-i)`, "otherScheme": `(?i)(` + strings.Join(xurls.SchemesNoAuthority, `|`) + `)(?-i):`, }) } func main() { log.Printf("Generating %s...", path) if err := writeRegex(xurls.TLDs); err != nil { log.Fatalf("Could not write %s: %v", path, err) } } xurls-1.1.0/generate/tldsgen/000077500000000000000000000000001304220601100161055ustar00rootroot00000000000000xurls-1.1.0/generate/tldsgen/main.go000066400000000000000000000050701304220601100173620ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package main import ( "bufio" "errors" "log" "net/http" "os" "regexp" "sort" "strings" "sync" "text/template" ) const path = "tlds.go" var tldsTmpl = template.Must(template.New("tlds").Parse(`// Generated by tldsgen package xurls // TLDs is a sorted list of all public top-level domains. // // Sources:{{range $_, $url := .URLs}} // * {{$url}}{{end}} var TLDs = []string{ {{range $_, $tld := .TLDs}}` + "\t`" + `{{$tld}}` + "`" + `, {{end}}} `)) func cleanTld(tld string) string { tld = strings.ToLower(tld) if strings.HasPrefix(tld, "xn--") { return "" } return tld } func fetchFromURL(url, pat string) { defer wg.Done() log.Printf("Fetching %s", url) resp, err := http.Get(url) if err == nil && resp.StatusCode >= 400 { err = errors.New(resp.Status) } if err != nil { errChan <- err return } defer resp.Body.Close() scanner := bufio.NewScanner(resp.Body) re := regexp.MustCompile(pat) for scanner.Scan() { line := scanner.Text() tld := re.FindString(line) tld = cleanTld(tld) if tld == "" { continue } tldChan <- tld } if err := scanner.Err(); err != nil { errChan <- err } } var ( wg sync.WaitGroup tldChan = make(chan string) errChan = make(chan error) ) func tldList() ([]string, []string, error) { var urls []string fromURL := func(url, pat string) { urls = append(urls, url) wg.Add(1) go fetchFromURL(url, pat) } fromURL("https://data.iana.org/TLD/tlds-alpha-by-domain.txt", `^[^#]+$`) fromURL("https://publicsuffix.org/list/effective_tld_names.dat", `^[^/.]+$`) tldSet := make(map[string]struct{}) anyError := false go func() { for { select { case tld := <-tldChan: tldSet[tld] = struct{}{} case err := <-errChan: log.Printf("%v", err) anyError = true } } }() wg.Wait() if anyError { return nil, nil, errors.New("there were some errors while fetching the TLDs") } tlds := make([]string, 0, len(tldSet)) for tld := range tldSet { tlds = append(tlds, tld) } sort.Strings(tlds) return tlds, urls, nil } func writeTlds(tlds, urls []string) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() return tldsTmpl.Execute(f, struct { TLDs []string URLs []string }{ TLDs: tlds, URLs: urls, }) } func main() { tlds, urls, err := tldList() if err != nil { log.Fatalf("Could not get TLD list: %v", err) } log.Printf("Generating %s...", path) if err := writeTlds(tlds, urls); err != nil { log.Fatalf("Could not write path: %v", err) } } xurls-1.1.0/regex.go000066400000000000000000000276111304220601100143230ustar00rootroot00000000000000// Generated by regexgen package xurls const ( gtld = `(?i)(aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|actor|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|bit|biz|bj|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|boots|bosch|bostik|boston|bot|boutique|box|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|chase|chat|cheap|chintai|chloe|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dwg|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|example|exchange|exit|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|gnu|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|htc|hu|hughes|hyatt|hyundai|i2p|ibm|icbc|ice|icu|id|ie|ieee|ifm|iinet|ikano|il|im|imamat|imdb|immo|immobilien|in|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|invalid|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|iwc|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|loan|loans|local|localhost|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mcd|mcdonalds|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|meo|merckmsd|metlife|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|montblanc|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtpc|mtr|mu|museum|mutual|mutuelle|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onion|onl|online|onyourside|ooo|open|oracle|orange|org|organic|orientexpress|origins|osaka|otsuka|ott|ovh|pa|page|pamperedchef|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|spot|spreadbetting|sr|srl|srt|st|stada|staples|star|starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telecity|telefonica|temasek|tennis|test|teva|tf|tg|th|thd|theater|theatre|theguardian|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|vermögensberater|vermögensberatung|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vista|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3oq18vl8pn36a|xn--3pxu8k|xn--42c2d9a|xn--45brj9c|xn--45q11c|xn--4gbrim|xn--4gq48lf9j|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ais|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cg4bki|xn--clchc0ea0b2g2a9gcd|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c|xn--eckvdtc9d|xn--efvy88h|xn--estv75g|xn--fct429k|xn--fhbei|xn--fiq228c5hs|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c|xn--gk3at1e|xn--h2brj9c|xn--hxt814e|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g|xn--jlq61u9w7b|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d|xn--kpu716f|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb2ddes|xn--mgb9awbf|xn--mgba3a3ejt|xn--mgba3a4f16a|xn--mgba3a4fra|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd|xn--mgbai9a5eva00b|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbb9fbpob|xn--mgbbh1a71e|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgberp4a5d4a87g|xn--mgberp4a5d4ar|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbqly7c0a67fbc|xn--mgbqly7cvafr|xn--mgbt3dhd|xn--mgbtf8fl|xn--mgbtx2b|xn--mgbx4cd0ab|xn--mix082f|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx|xn--nnx388a|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl|xn--p1acf|xn--p1ai|xn--pbt977c|xn--pgbs0dh|xn--pssy2u|xn--q9jyb4c|xn--qcka1pmc|xn--qxam|xn--rhqv96g|xn--rovu88b|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o|xn--ygbi2ammx|xn--zfr164b|xperia|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zkey|zm|zone|zuerich|zw|ελ|бел|дети|ею|католик|ком|мкд|мон|москва|онлайн|орг|рус|рф|сайт|срб|укр|қаз|հայ|קום|ابوظبي|اتصالات|ارامكو|الاردن|الجزائر|السعودية|السعوديه|السعودیة|السعودیۃ|العليان|المغرب|اليمن|امارات|ايران|ایران|بازار|بيتك|بھارت|تونس|سودان|سوريا|سورية|شبكة|عراق|عرب|عمان|فلسطين|قطر|كاثوليك|كوم|مصر|مليسيا|موبايلي|موقع|همراه|پاكستان|پاکستان|कॉम|नेट|भारत|संगठन|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|คอม|ไทย|გე|みんな|クラウド|グーグル|コム|ストア|セール|ファッション|ポイント|一号店|世界|中信|中国|中國|中文网|企业|佛山|信息|健康|八卦|公司|公益|台湾|台灣|商城|商店|商标|嘉里|嘉里大酒店|在线|大众汽车|大拿|天主教|娱乐|家電|工行|广东|微博|慈善|我爱你|手机|手表|政务|政府|新加坡|新闻|时尚|書籍|机构|淡马锡|游戏|澳門|澳门|点看|珠宝|移动|组织机构|网址|网店|网站|网络|联通|臺灣|诺基亚|谷歌|购物|通販|集团|電訊盈科|飞利浦|食品|餐厅|香格里拉|香港|닷넷|닷컴|삼성|한국)(?-i)` otherScheme = `(?i)(bitcoin|file|magnet|mailto|sms|tel|xmpp)(?-i):` ) xurls-1.1.0/schemes.go000066400000000000000000000007621304220601100146360ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package xurls // SchemesNoAuthority is a sorted list of some well-known url schemes that are // followed by ":" instead of "://". Since these are more prone to false // positives, we limit their matching. var SchemesNoAuthority = []string{ `bitcoin`, // Bitcoin `file`, // Files `magnet`, // Torrent magnets `mailto`, // Mail `sms`, // SMS `tel`, // Telephone `xmpp`, // XMPP } xurls-1.1.0/tlds.go000066400000000000000000000401051304220601100141500ustar00rootroot00000000000000// Generated by tldsgen package xurls // TLDs is a sorted list of all public top-level domains. // // Sources: // * https://data.iana.org/TLD/tlds-alpha-by-domain.txt // * https://publicsuffix.org/list/effective_tld_names.dat var TLDs = []string{ `aaa`, `aarp`, `abarth`, `abb`, `abbott`, `abbvie`, `abc`, `able`, `abogado`, `abudhabi`, `ac`, `academy`, `accenture`, `accountant`, `accountants`, `aco`, `active`, `actor`, `ad`, `adac`, `ads`, `adult`, `ae`, `aeg`, `aero`, `aetna`, `af`, `afamilycompany`, `afl`, `africa`, `ag`, `agakhan`, `agency`, `ai`, `aig`, `aigo`, `airbus`, `airforce`, `airtel`, `akdn`, `al`, `alfaromeo`, `alibaba`, `alipay`, `allfinanz`, `allstate`, `ally`, `alsace`, `alstom`, `am`, `americanexpress`, `americanfamily`, `amex`, `amfam`, `amica`, `amsterdam`, `analytics`, `android`, `anquan`, `anz`, `ao`, `aol`, `apartments`, `app`, `apple`, `aq`, `aquarelle`, `ar`, `arab`, `aramco`, `archi`, `army`, `arpa`, `art`, `arte`, `as`, `asda`, `asia`, `associates`, `at`, `athleta`, `attorney`, `au`, `auction`, `audi`, `audible`, `audio`, `auspost`, `author`, `auto`, `autos`, `avianca`, `aw`, `aws`, `ax`, `axa`, `az`, `azure`, `ba`, `baby`, `baidu`, `banamex`, `bananarepublic`, `band`, `bank`, `bar`, `barcelona`, `barclaycard`, `barclays`, `barefoot`, `bargains`, `baseball`, `basketball`, `bauhaus`, `bayern`, `bb`, `bbc`, `bbt`, `bbva`, `bcg`, `bcn`, `bd`, `be`, `beats`, `beauty`, `beer`, `bentley`, `berlin`, `best`, `bestbuy`, `bet`, `bf`, `bg`, `bh`, `bharti`, `bi`, `bible`, `bid`, `bike`, `bing`, `bingo`, `bio`, `biz`, `bj`, `black`, `blackfriday`, `blanco`, `blockbuster`, `blog`, `bloomberg`, `blue`, `bm`, `bms`, `bmw`, `bn`, `bnl`, `bnpparibas`, `bo`, `boats`, `boehringer`, `bofa`, `bom`, `bond`, `boo`, `book`, `booking`, `boots`, `bosch`, `bostik`, `boston`, `bot`, `boutique`, `box`, `br`, `bradesco`, `bridgestone`, `broadway`, `broker`, `brother`, `brussels`, `bs`, `bt`, `budapest`, `bugatti`, `build`, `builders`, `business`, `buy`, `buzz`, `bv`, `bw`, `by`, `bz`, `bzh`, `ca`, `cab`, `cafe`, `cal`, `call`, `calvinklein`, `cam`, `camera`, `camp`, `cancerresearch`, `canon`, `capetown`, `capital`, `capitalone`, `car`, `caravan`, `cards`, `care`, `career`, `careers`, `cars`, `cartier`, `casa`, `case`, `caseih`, `cash`, `casino`, `cat`, `catering`, `catholic`, `cba`, `cbn`, `cbre`, `cbs`, `cc`, `cd`, `ceb`, `center`, `ceo`, `cern`, `cf`, `cfa`, `cfd`, `cg`, `ch`, `chanel`, `channel`, `chase`, `chat`, `cheap`, `chintai`, `chloe`, `christmas`, `chrome`, `chrysler`, `church`, `ci`, `cipriani`, `circle`, `cisco`, `citadel`, `citi`, `citic`, `city`, `cityeats`, `ck`, `cl`, `claims`, `cleaning`, `click`, `clinic`, `clinique`, `clothing`, `cloud`, `club`, `clubmed`, `cm`, `cn`, `co`, `coach`, `codes`, `coffee`, `college`, `cologne`, `com`, `comcast`, `commbank`, `community`, `company`, `compare`, `computer`, `comsec`, `condos`, `construction`, `consulting`, `contact`, `contractors`, `cooking`, `cookingchannel`, `cool`, `coop`, `corsica`, `country`, `coupon`, `coupons`, `courses`, `cr`, `credit`, `creditcard`, `creditunion`, `cricket`, `crown`, `crs`, `cruise`, `cruises`, `csc`, `cu`, `cuisinella`, `cv`, `cw`, `cx`, `cy`, `cymru`, `cyou`, `cz`, `dabur`, `dad`, `dance`, `data`, `date`, `dating`, `datsun`, `day`, `dclk`, `dds`, `de`, `deal`, `dealer`, `deals`, `degree`, `delivery`, `dell`, `deloitte`, `delta`, `democrat`, `dental`, `dentist`, `desi`, `design`, `dev`, `dhl`, `diamonds`, `diet`, `digital`, `direct`, `directory`, `discount`, `discover`, `dish`, `diy`, `dj`, `dk`, `dm`, `dnp`, `do`, `docs`, `doctor`, `dodge`, `dog`, `doha`, `domains`, `dot`, `download`, `drive`, `dtv`, `dubai`, `duck`, `dunlop`, `duns`, `dupont`, `durban`, `dvag`, `dvr`, `dwg`, `dz`, `earth`, `eat`, `ec`, `eco`, `edeka`, `edu`, `education`, `ee`, `eg`, `email`, `emerck`, `energy`, `engineer`, `engineering`, `enterprises`, `epost`, `epson`, `equipment`, `er`, `ericsson`, `erni`, `es`, `esq`, `estate`, `esurance`, `et`, `etisalat`, `eu`, `eurovision`, `eus`, `events`, `everbank`, `exchange`, `expert`, `exposed`, `express`, `extraspace`, `fage`, `fail`, `fairwinds`, `faith`, `family`, `fan`, `fans`, `farm`, `farmers`, `fashion`, `fast`, `fedex`, `feedback`, `ferrari`, `ferrero`, `fi`, `fiat`, `fidelity`, `fido`, `film`, `final`, `finance`, `financial`, `fire`, `firestone`, `firmdale`, `fish`, `fishing`, `fit`, `fitness`, `fj`, `fk`, `flickr`, `flights`, `flir`, `florist`, `flowers`, `fly`, `fm`, `fo`, `foo`, `food`, `foodnetwork`, `football`, `ford`, `forex`, `forsale`, `forum`, `foundation`, `fox`, `fr`, `free`, `fresenius`, `frl`, `frogans`, `frontdoor`, `frontier`, `ftr`, `fujitsu`, `fujixerox`, `fun`, `fund`, `furniture`, `futbol`, `fyi`, `ga`, `gal`, `gallery`, `gallo`, `gallup`, `game`, `games`, `gap`, `garden`, `gb`, `gbiz`, `gd`, `gdn`, `ge`, `gea`, `gent`, `genting`, `george`, `gf`, `gg`, `ggee`, `gh`, `gi`, `gift`, `gifts`, `gives`, `giving`, `gl`, `glade`, `glass`, `gle`, `global`, `globo`, `gm`, `gmail`, `gmbh`, `gmo`, `gmx`, `gn`, `godaddy`, `gold`, `goldpoint`, `golf`, `goo`, `goodhands`, `goodyear`, `goog`, `google`, `gop`, `got`, `gov`, `gp`, `gq`, `gr`, `grainger`, `graphics`, `gratis`, `green`, `gripe`, `grocery`, `group`, `gs`, `gt`, `gu`, `guardian`, `gucci`, `guge`, `guide`, `guitars`, `guru`, `gw`, `gy`, `hair`, `hamburg`, `hangout`, `haus`, `hbo`, `hdfc`, `hdfcbank`, `health`, `healthcare`, `help`, `helsinki`, `here`, `hermes`, `hgtv`, `hiphop`, `hisamitsu`, `hitachi`, `hiv`, `hk`, `hkt`, `hm`, `hn`, `hockey`, `holdings`, `holiday`, `homedepot`, `homegoods`, `homes`, `homesense`, `honda`, `honeywell`, `horse`, `hospital`, `host`, `hosting`, `hot`, `hoteles`, `hotels`, `hotmail`, `house`, `how`, `hr`, `hsbc`, `ht`, `htc`, `hu`, `hughes`, `hyatt`, `hyundai`, `ibm`, `icbc`, `ice`, `icu`, `id`, `ie`, `ieee`, `ifm`, `iinet`, `ikano`, `il`, `im`, `imamat`, `imdb`, `immo`, `immobilien`, `in`, `industries`, `infiniti`, `info`, `ing`, `ink`, `institute`, `insurance`, `insure`, `int`, `intel`, `international`, `intuit`, `investments`, `io`, `ipiranga`, `iq`, `ir`, `irish`, `is`, `iselect`, `ismaili`, `ist`, `istanbul`, `it`, `itau`, `itv`, `iveco`, `iwc`, `jaguar`, `java`, `jcb`, `jcp`, `je`, `jeep`, `jetzt`, `jewelry`, `jio`, `jlc`, `jll`, `jm`, `jmp`, `jnj`, `jo`, `jobs`, `joburg`, `jot`, `joy`, `jp`, `jpmorgan`, `jprs`, `juegos`, `juniper`, `kaufen`, `kddi`, `ke`, `kerryhotels`, `kerrylogistics`, `kerryproperties`, `kfh`, `kg`, `kh`, `ki`, `kia`, `kim`, `kinder`, `kindle`, `kitchen`, `kiwi`, `km`, `kn`, `koeln`, `komatsu`, `kosher`, `kp`, `kpmg`, `kpn`, `kr`, `krd`, `kred`, `kuokgroup`, `kw`, `ky`, `kyoto`, `kz`, `la`, `lacaixa`, `ladbrokes`, `lamborghini`, `lamer`, `lancaster`, `lancia`, `lancome`, `land`, `landrover`, `lanxess`, `lasalle`, `lat`, `latino`, `latrobe`, `law`, `lawyer`, `lb`, `lc`, `lds`, `lease`, `leclerc`, `lefrak`, `legal`, `lego`, `lexus`, `lgbt`, `li`, `liaison`, `lidl`, `life`, `lifeinsurance`, `lifestyle`, `lighting`, `like`, `lilly`, `limited`, `limo`, `lincoln`, `linde`, `link`, `lipsy`, `live`, `living`, `lixil`, `lk`, `loan`, `loans`, `locker`, `locus`, `loft`, `lol`, `london`, `lotte`, `lotto`, `love`, `lpl`, `lplfinancial`, `lr`, `ls`, `lt`, `ltd`, `ltda`, `lu`, `lundbeck`, `lupin`, `luxe`, `luxury`, `lv`, `ly`, `ma`, `macys`, `madrid`, `maif`, `maison`, `makeup`, `man`, `management`, `mango`, `map`, `market`, `marketing`, `markets`, `marriott`, `marshalls`, `maserati`, `mattel`, `mba`, `mc`, `mcd`, `mcdonalds`, `mckinsey`, `md`, `me`, `med`, `media`, `meet`, `melbourne`, `meme`, `memorial`, `men`, `menu`, `meo`, `merckmsd`, `metlife`, `mg`, `mh`, `miami`, `microsoft`, `mil`, `mini`, `mint`, `mit`, `mitsubishi`, `mk`, `ml`, `mlb`, `mls`, `mm`, `mma`, `mn`, `mo`, `mobi`, `mobile`, `mobily`, `moda`, `moe`, `moi`, `mom`, `monash`, `money`, `monster`, `montblanc`, `mopar`, `mormon`, `mortgage`, `moscow`, `moto`, `motorcycles`, `mov`, `movie`, `movistar`, `mp`, `mq`, `mr`, `ms`, `msd`, `mt`, `mtn`, `mtpc`, `mtr`, `mu`, `museum`, `mutual`, `mutuelle`, `mv`, `mw`, `mx`, `my`, `mz`, `na`, `nab`, `nadex`, `nagoya`, `name`, `nationwide`, `natura`, `navy`, `nba`, `nc`, `ne`, `nec`, `net`, `netbank`, `netflix`, `network`, `neustar`, `new`, `newholland`, `news`, `next`, `nextdirect`, `nexus`, `nf`, `nfl`, `ng`, `ngo`, `nhk`, `ni`, `nico`, `nike`, `nikon`, `ninja`, `nissan`, `nissay`, `nl`, `no`, `nokia`, `northwesternmutual`, `norton`, `now`, `nowruz`, `nowtv`, `np`, `nr`, `nra`, `nrw`, `ntt`, `nu`, `nyc`, `nz`, `obi`, `observer`, `off`, `office`, `okinawa`, `olayan`, `olayangroup`, `oldnavy`, `ollo`, `om`, `omega`, `one`, `ong`, `onion`, `onl`, `online`, `onyourside`, `ooo`, `open`, `oracle`, `orange`, `org`, `organic`, `orientexpress`, `origins`, `osaka`, `otsuka`, `ott`, `ovh`, `pa`, `page`, `pamperedchef`, `panasonic`, `panerai`, `paris`, `pars`, `partners`, `parts`, `party`, `passagens`, `pay`, `pccw`, `pe`, `pet`, `pf`, `pfizer`, `pg`, `ph`, `pharmacy`, `phd`, `philips`, `phone`, `photo`, `photography`, `photos`, `physio`, `piaget`, `pics`, `pictet`, `pictures`, `pid`, `pin`, `ping`, `pink`, `pioneer`, `pizza`, `pk`, `pl`, `place`, `play`, `playstation`, `plumbing`, `plus`, `pm`, `pn`, `pnc`, `pohl`, `poker`, `politie`, `porn`, `post`, `pr`, `pramerica`, `praxi`, `press`, `prime`, `pro`, `prod`, `productions`, `prof`, `progressive`, `promo`, `properties`, `property`, `protection`, `pru`, `prudential`, `ps`, `pt`, `pub`, `pw`, `pwc`, `py`, `qa`, `qpon`, `quebec`, `quest`, `qvc`, `racing`, `radio`, `raid`, `re`, `read`, `realestate`, `realtor`, `realty`, `recipes`, `red`, `redstone`, `redumbrella`, `rehab`, `reise`, `reisen`, `reit`, `reliance`, `ren`, `rent`, `rentals`, `repair`, `report`, `republican`, `rest`, `restaurant`, `review`, `reviews`, `rexroth`, `rich`, `richardli`, `ricoh`, `rightathome`, `ril`, `rio`, `rip`, `rmit`, `ro`, `rocher`, `rocks`, `rodeo`, `rogers`, `room`, `rs`, `rsvp`, `ru`, `ruhr`, `run`, `rw`, `rwe`, `ryukyu`, `sa`, `saarland`, `safe`, `safety`, `sakura`, `sale`, `salon`, `samsclub`, `samsung`, `sandvik`, `sandvikcoromant`, `sanofi`, `sap`, `sapo`, `sarl`, `sas`, `save`, `saxo`, `sb`, `sbi`, `sbs`, `sc`, `sca`, `scb`, `schaeffler`, `schmidt`, `scholarships`, `school`, `schule`, `schwarz`, `science`, `scjohnson`, `scor`, `scot`, `sd`, `se`, `search`, `seat`, `secure`, `security`, `seek`, `select`, `sener`, `services`, `ses`, `seven`, `sew`, `sex`, `sexy`, `sfr`, `sg`, `sh`, `shangrila`, `sharp`, `shaw`, `shell`, `shia`, `shiksha`, `shoes`, `shop`, `shopping`, `shouji`, `show`, `showtime`, `shriram`, `si`, `silk`, `sina`, `singles`, `site`, `sj`, `sk`, `ski`, `skin`, `sky`, `skype`, `sl`, `sling`, `sm`, `smart`, `smile`, `sn`, `sncf`, `so`, `soccer`, `social`, `softbank`, `software`, `sohu`, `solar`, `solutions`, `song`, `sony`, `soy`, `space`, `spiegel`, `spot`, `spreadbetting`, `sr`, `srl`, `srt`, `st`, `stada`, `staples`, `star`, `starhub`, `statebank`, `statefarm`, `statoil`, `stc`, `stcgroup`, `stockholm`, `storage`, `store`, `stream`, `studio`, `study`, `style`, `su`, `sucks`, `supplies`, `supply`, `support`, `surf`, `surgery`, `suzuki`, `sv`, `swatch`, `swiftcover`, `swiss`, `sx`, `sy`, `sydney`, `symantec`, `systems`, `sz`, `tab`, `taipei`, `talk`, `taobao`, `target`, `tatamotors`, `tatar`, `tattoo`, `tax`, `taxi`, `tc`, `tci`, `td`, `tdk`, `team`, `tech`, `technology`, `tel`, `telecity`, `telefonica`, `temasek`, `tennis`, `teva`, `tf`, `tg`, `th`, `thd`, `theater`, `theatre`, `theguardian`, `tiaa`, `tickets`, `tienda`, `tiffany`, `tips`, `tires`, `tirol`, `tj`, `tjmaxx`, `tjx`, `tk`, `tkmaxx`, `tl`, `tm`, `tmall`, `tn`, `to`, `today`, `tokyo`, `tools`, `top`, `toray`, `toshiba`, `total`, `tours`, `town`, `toyota`, `toys`, `tr`, `trade`, `trading`, `training`, `travel`, `travelchannel`, `travelers`, `travelersinsurance`, `trust`, `trv`, `tt`, `tube`, `tui`, `tunes`, `tushu`, `tv`, `tvs`, `tw`, `tz`, `ua`, `ubank`, `ubs`, `uconnect`, `ug`, `uk`, `unicom`, `university`, `uno`, `uol`, `ups`, `us`, `uy`, `uz`, `va`, `vacations`, `vana`, `vanguard`, `vc`, `ve`, `vegas`, `ventures`, `verisign`, `vermögensberater`, `vermögensberatung`, `versicherung`, `vet`, `vg`, `vi`, `viajes`, `video`, `vig`, `viking`, `villas`, `vin`, `vip`, `virgin`, `visa`, `vision`, `vista`, `vistaprint`, `viva`, `vivo`, `vlaanderen`, `vn`, `vodka`, `volkswagen`, `volvo`, `vote`, `voting`, `voto`, `voyage`, `vu`, `vuelos`, `wales`, `walmart`, `walter`, `wang`, `wanggou`, `warman`, `watch`, `watches`, `weather`, `weatherchannel`, `webcam`, `weber`, `website`, `wed`, `wedding`, `weibo`, `weir`, `wf`, `whoswho`, `wien`, `wiki`, `williamhill`, `win`, `windows`, `wine`, `winners`, `wme`, `wolterskluwer`, `woodside`, `work`, `works`, `world`, `wow`, `ws`, `wtc`, `wtf`, `xbox`, `xerox`, `xfinity`, `xihuan`, `xin`, `xperia`, `xxx`, `xyz`, `yachts`, `yahoo`, `yamaxun`, `yandex`, `ye`, `yodobashi`, `yoga`, `yokohama`, `you`, `youtube`, `yt`, `yun`, `za`, `zappos`, `zara`, `zero`, `zip`, `zippo`, `zm`, `zone`, `zuerich`, `zw`, `ελ`, `бел`, `дети`, `ею`, `католик`, `ком`, `мкд`, `мон`, `москва`, `онлайн`, `орг`, `рус`, `рф`, `сайт`, `срб`, `укр`, `қаз`, `հայ`, `קום`, `ابوظبي`, `اتصالات`, `ارامكو`, `الاردن`, `الجزائر`, `السعودية`, `السعوديه`, `السعودیة`, `السعودیۃ`, `العليان`, `المغرب`, `اليمن`, `امارات`, `ايران`, `ایران`, `بازار`, `بيتك`, `بھارت`, `تونس`, `سودان`, `سوريا`, `سورية`, `شبكة`, `عراق`, `عرب`, `عمان`, `فلسطين`, `قطر`, `كاثوليك`, `كوم`, `مصر`, `مليسيا`, `موبايلي`, `موقع`, `همراه`, `پاكستان`, `پاکستان`, `कॉम`, `नेट`, `भारत`, `संगठन`, `বাংলা`, `ভারত`, `ਭਾਰਤ`, `ભારત`, `இந்தியா`, `இலங்கை`, `சிங்கப்பூர்`, `భారత్`, `ලංකා`, `คอม`, `ไทย`, `გე`, `みんな`, `クラウド`, `グーグル`, `コム`, `ストア`, `セール`, `ファッション`, `ポイント`, `一号店`, `世界`, `中信`, `中国`, `中國`, `中文网`, `企业`, `佛山`, `信息`, `健康`, `八卦`, `公司`, `公益`, `台湾`, `台灣`, `商城`, `商店`, `商标`, `嘉里`, `嘉里大酒店`, `在线`, `大众汽车`, `大拿`, `天主教`, `娱乐`, `家電`, `工行`, `广东`, `微博`, `慈善`, `我爱你`, `手机`, `手表`, `政务`, `政府`, `新加坡`, `新闻`, `时尚`, `書籍`, `机构`, `淡马锡`, `游戏`, `澳門`, `澳门`, `点看`, `珠宝`, `移动`, `组织机构`, `网址`, `网店`, `网站`, `网络`, `联通`, `臺灣`, `诺基亚`, `谷歌`, `购物`, `通販`, `集团`, `電訊盈科`, `飞利浦`, `食品`, `餐厅`, `香格里拉`, `香港`, `닷넷`, `닷컴`, `삼성`, `한국`, } xurls-1.1.0/tlds_pseudo.go000066400000000000000000000015041304220601100155270ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package xurls // PseudoTLDs is a sorted list of some widely used unofficial TLDs. // // Sources: // * https://en.wikipedia.org/wiki/Pseudo-top-level_domain // * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains // * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00 // * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml var PseudoTLDs = []string{ `bit`, // Namecoin `example`, // Example domain `exit`, // Tor exit node `gnu`, // GNS by public key `i2p`, // I2P network `invalid`, // Invalid domain `local`, // Local network `localhost`, // Local network `test`, // Test domain `zkey`, // GNS domain name } xurls-1.1.0/xurls.go000066400000000000000000000050721304220601100143630ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information // Package xurls extracts urls from plain text using regular expressions. package xurls import "regexp" //go:generate go run generate/tldsgen/main.go //go:generate go run generate/regexgen/main.go const ( letter = `\p{L}` mark = `\p{M}` number = `\p{N}` iriChar = letter + mark + number currency = `\p{Sc}` otherSymb = `\p{So}` endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb midChar = endChar + `@.,:;'?!|` wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)` wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]` wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}` wellAll = wellParen + `|` + wellBrack + `|` + wellBrace pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+` comScheme = `[a-zA-Z][a-zA-Z.\-+]*://` scheme = `(` + comScheme + `|` + otherScheme + `)` iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?` domain = `(` + iri + `\.)+` octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b` ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:` ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)` site = domain + gtld hostName = `(` + site + `|` + ipAddr + `)` port = `(:[0-9]*)?` path = `(/|/` + pathCont + `?|\b|$)` webURL = hostName + port + path strict = `(\b` + scheme + pathCont + `)` relaxed = `(` + strict + `|` + webURL + `)` ) var ( // Relaxed matches all the urls it can find. Relaxed = regexp.MustCompile(relaxed) // Strict only matches urls with a scheme to avoid false positives. Strict = regexp.MustCompile(strict) ) func init() { Relaxed.Longest() Strict.Longest() } // StrictMatchingScheme produces a regexp that matches urls like Strict but // whose scheme matches the given regular expression. func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { strictMatching := `(\b(?i)(` + exp + `)(?-i)` + pathCont + `)` re, err := regexp.Compile(strictMatching) if err != nil { return nil, err } re.Longest() return re, nil } xurls-1.1.0/xurls_test.go000066400000000000000000000222461304220601100154240ustar00rootroot00000000000000// Copyright (c) 2015, Daniel Martí // See LICENSE for licensing information package xurls import ( "fmt" "regexp" "testing" ) type testCase struct { in string want interface{} } func wantStr(in string, want interface{}) string { switch x := want.(type) { case string: return x case bool: if x { return in } } return "" } func doTest(t *testing.T, name string, re *regexp.Regexp, cases []testCase) { for i, c := range cases { t.Run(fmt.Sprintf("%s/%03d", name, i), func(t *testing.T) { got := re.FindString(c.in) want := wantStr(c.in, c.want) if got != want { t.Errorf(`%s.FindString("%s") got "%s", want "%s"`, name, c.in, got, want) } }) } } var constantTestCases = []testCase{ {``, nil}, {` `, nil}, {`:`, nil}, {`::`, nil}, {`:::`, nil}, {`::::`, nil}, {`.`, nil}, {`..`, nil}, {`...`, nil}, {`1.1`, nil}, {`.1.`, nil}, {`1.1.1`, nil}, {`1:1`, nil}, {`:1:`, nil}, {`1:1:1`, nil}, {`://`, nil}, {`foo`, nil}, {`foo:`, nil}, {`mailto:`, nil}, {`foo://`, nil}, {`http://`, nil}, {`http:// foo`, nil}, {`http:// foo`, nil}, {`:foo`, nil}, {`://foo`, nil}, {`foorandom:bar`, nil}, {`foo.randombar`, nil}, {`zzz.`, nil}, {`.zzz`, nil}, {`zzz.zzz`, nil}, {`/some/path`, nil}, {`rel/path`, nil}, {`localhost`, nil}, {`com`, nil}, {`.com`, nil}, {`com.`, nil}, {`http`, nil}, {`http://foo`, true}, {`http://FOO`, true}, {`http://FAÀ`, true}, {`https://localhost`, true}, {`git+https://localhost`, true}, {`foo.bar://localhost`, true}, {`foo-bar://localhost`, true}, {`mailto:foo`, true}, {`MAILTO:foo`, true}, {`sms:123`, true}, {`xmpp:foo@bar`, true}, {`bitcoin:Addr23?amount=1&message=foo`, true}, {`http://foo.com`, true}, {`http://foo.co.uk`, true}, {`http://foo.random`, true}, {` http://foo.com/bar `, `http://foo.com/bar`}, {` http://foo.com/bar more`, `http://foo.com/bar`}, {``, `http://foo.com/bar`}, {`more`, `http://foo.com/bar`}, {`.http://foo.com/bar.`, `http://foo.com/bar`}, {`.http://foo.com/bar.more`, `http://foo.com/bar.more`}, {`,http://foo.com/bar,`, `http://foo.com/bar`}, {`,http://foo.com/bar,more`, `http://foo.com/bar,more`}, {`(http://foo.com/bar)`, `http://foo.com/bar`}, {`(http://foo.com/bar)more`, `http://foo.com/bar`}, {`[http://foo.com/bar]`, `http://foo.com/bar`}, {`[http://foo.com/bar]more`, `http://foo.com/bar`}, {`'http://foo.com/bar'`, `http://foo.com/bar`}, {`'http://foo.com/bar'more`, `http://foo.com/bar'more`}, {`"http://foo.com/bar"`, `http://foo.com/bar`}, {`http://a.b/a0/-+_&~*%=#@.,:;'?!|[]()a`, true}, {`http://a.b/a0/$€¥`, true}, {`http://✪foo.bar/pa✪th©more`, true}, {`http://foo.bar/path/`, true}, {`http://foo.bar/path-`, true}, {`http://foo.bar/path+`, true}, {`http://foo.bar/path_`, true}, {`http://foo.bar/path&`, true}, {`http://foo.bar/path~`, true}, {`http://foo.bar/path*`, true}, {`http://foo.bar/path%`, true}, {`http://foo.bar/path=`, true}, {`http://foo.bar/path#`, true}, {`http://foo.bar/path.`, `http://foo.bar/path`}, {`http://foo.bar/path,`, `http://foo.bar/path`}, {`http://foo.bar/path:`, `http://foo.bar/path`}, {`http://foo.bar/path;`, `http://foo.bar/path`}, {`http://foo.bar/path'`, `http://foo.bar/path`}, {`http://foo.bar/path?`, `http://foo.bar/path`}, {`http://foo.bar/path!`, `http://foo.bar/path`}, {`http://foo.bar/path@`, `http://foo.bar/path`}, {`http://foo.bar/path|`, `http://foo.bar/path`}, {`http://foo.bar/path<`, `http://foo.bar/path`}, {`http://foo.bar/path`, `foo.com/bar`}, {`more`, `foo.com/bar`}, {`,foo.com/bar.`, `foo.com/bar`}, {`,foo.com/bar.more`, `foo.com/bar.more`}, {`,foo.com/bar,`, `foo.com/bar`}, {`,foo.com/bar,more`, `foo.com/bar,more`}, {`(foo.com/bar)`, `foo.com/bar`}, {`"foo.com/bar'`, `foo.com/bar`}, {`"foo.com/bar'more`, `foo.com/bar'more`}, {`"foo.com/bar"`, `foo.com/bar`}, {`what is foo.com?`, `foo.com`}, {`the foo.com!`, `foo.com`}, {`foo@bar`, nil}, {`foo@bar.a`, nil}, {`foo@bar.com`, "bar.com"}, {`foo@sub.bar.com`, "sub.bar.com"}, {`foo@中国.中国`, "中国.中国"}, }) doTest(t, "Strict2", Strict, []testCase{ {`http:// foo.com`, nil}, {`foo.a`, nil}, {`foo.com`, nil}, {`foo.com/`, nil}, {`1.1.1.1`, nil}, {`3ffe:2a00:100:7031::1`, nil}, {`test.foo.com:8080/path`, nil}, {`foo@bar.com`, nil}, }) } func TestStrictMatchingSchemeError(t *testing.T) { for _, c := range []struct { exp string wantErr bool }{ {`http://`, false}, {`https?://`, false}, {`http://|mailto:`, false}, {`http://(`, true}, } { _, err := StrictMatchingScheme(c.exp) if c.wantErr && err == nil { t.Errorf(`StrictMatchingScheme("%s") did not error as expected`, c.exp) } else if !c.wantErr && err != nil { t.Errorf(`StrictMatchingScheme("%s") unexpectedly errored`, c.exp) } } } func TestStrictMatchingScheme(t *testing.T) { strictMatching, _ := StrictMatchingScheme("http://|ftps?://|mailto:") doTest(t, "StrictMatchingScheme", strictMatching, []testCase{ {`foo.com`, nil}, {`foo@bar.com`, nil}, {`http://foo`, true}, {`Http://foo`, true}, {`https://foo`, nil}, {`ftp://foo`, true}, {`ftps://foo`, true}, {`mailto:foo`, true}, {`MAILTO:foo`, true}, {`sms:123`, nil}, }) } func bench(b *testing.B, re *regexp.Regexp, str string) { for i := 0; i < b.N; i++ { re.FindAllString(str, -1) } } func BenchmarkStrictEmpty(b *testing.B) { bench(b, Strict, "foo") } func BenchmarkStrictSingle(b *testing.B) { bench(b, Strict, "http://foo.foo foo.com") } func BenchmarkStrictMany(b *testing.B) { bench(b, Strict, ` foo bar http://foo.foo foo.com bitcoin:address ftp:// xmpp:foo@bar.com`) } func BenchmarkRelaxedEmpty(b *testing.B) { bench(b, Relaxed, "foo") } func BenchmarkRelaxedSingle(b *testing.B) { bench(b, Relaxed, "http://foo.foo foo.com") } func BenchmarkRelaxedMany(b *testing.B) { bench(b, Relaxed, ` foo bar http://foo.foo foo.com bitcoin:address ftp:// xmpp:foo@bar.com`) }