pax_global_header00006660000000000000000000000064135445621410014517gustar00rootroot0000000000000052 comment=ae3ad266dd558a393a5c37332eb6e9f28bb7a177 go-unicodeclass-0.0.1/000077500000000000000000000000001354456214100145745ustar00rootroot00000000000000go-unicodeclass-0.0.1/README.md000066400000000000000000000003641354456214100160560ustar00rootroot00000000000000# go-unicodeclass Unicode class package ## Usage ``` fmt.Println(unicodeclass.Is('世')) // CJKIdeographs ``` ## Installation ``` $ go get github.com/mattn/go-unicodeclass ``` ## License MIT ## Author Yasuhiro Matsumoto (a.k.a. mattn) go-unicodeclass-0.0.1/class_string.go000066400000000000000000000021521354456214100176160ustar00rootroot00000000000000// Code generated by "stringer -type Class"; DO NOT EDIT package unicodeclass import "fmt" const ( _Class_name_0 = "BlankPunctationWordEmoji" _Class_name_1 = "SuperScript" _Class_name_2 = "SubScript" _Class_name_3 = "Braille" _Class_name_4 = "Hiragana" _Class_name_5 = "Katakana" _Class_name_6 = "CJKIdeographs" _Class_name_7 = "HungulSyllables" ) var ( _Class_index_0 = [...]uint8{0, 5, 15, 19, 24} _Class_index_1 = [...]uint8{0, 11} _Class_index_2 = [...]uint8{0, 9} _Class_index_3 = [...]uint8{0, 7} _Class_index_4 = [...]uint8{0, 8} _Class_index_5 = [...]uint8{0, 8} _Class_index_6 = [...]uint8{0, 13} _Class_index_7 = [...]uint8{0, 15} ) func (i Class) String() string { switch { case 0 <= i && i <= 3: return _Class_name_0[_Class_index_0[i]:_Class_index_0[i+1]] case i == 8304: return _Class_name_1 case i == 8320: return _Class_name_2 case i == 10240: return _Class_name_3 case i == 12352: return _Class_name_4 case i == 12448: return _Class_name_5 case i == 19968: return _Class_name_6 case i == 44032: return _Class_name_7 default: return fmt.Sprintf("Class(%d)", i) } } go-unicodeclass-0.0.1/cmd/000077500000000000000000000000001354456214100153375ustar00rootroot00000000000000go-unicodeclass-0.0.1/cmd/words/000077500000000000000000000000001354456214100164755ustar00rootroot00000000000000go-unicodeclass-0.0.1/cmd/words/main.go000066400000000000000000000004311354456214100177460ustar00rootroot00000000000000package main import ( "bufio" "fmt" "log" "os" "github.com/mattn/go-unicodeclass" ) func main() { scan := bufio.NewScanner(os.Stdin) scan.Split(unicodeclass.SplitClass) for scan.Scan() { fmt.Println(scan.Text()) } if scan.Err() != nil { log.Fatal(scan.Err()) } } go-unicodeclass-0.0.1/unicodeclass.go000066400000000000000000000142461354456214100176060ustar00rootroot00000000000000package unicodeclass import ( "bufio" "strings" "unicode" "unicode/utf8" ) type Class int const ( Invalid Class = -1 Blank Class = 0 Punctation Class = 1 Word Class = 2 Emoji Class = 3 SuperScript Class = 0x2070 SubScript Class = 0x2080 Braille Class = 0x2800 Hiragana Class = 0x3040 Katakana Class = 0x30a0 CJKIdeographs Class = 0x4e00 HungulSyllables Class = 0xac00 ) var classes = []struct { first rune last rune value int }{ {0x037e, 0x037e, 1}, /* Greek question mark */ {0x0387, 0x0387, 1}, /* Greek ano teleia */ {0x055a, 0x055f, 1}, /* Armenian punctuation */ {0x0589, 0x0589, 1}, /* Armenian full stop */ {0x05be, 0x05be, 1}, {0x05c0, 0x05c0, 1}, {0x05c3, 0x05c3, 1}, {0x05f3, 0x05f4, 1}, {0x060c, 0x060c, 1}, {0x061b, 0x061b, 1}, {0x061f, 0x061f, 1}, {0x066a, 0x066d, 1}, {0x06d4, 0x06d4, 1}, {0x0700, 0x070d, 1}, /* Syriac punctuation */ {0x0964, 0x0965, 1}, {0x0970, 0x0970, 1}, {0x0df4, 0x0df4, 1}, {0x0e4f, 0x0e4f, 1}, {0x0e5a, 0x0e5b, 1}, {0x0f04, 0x0f12, 1}, {0x0f3a, 0x0f3d, 1}, {0x0f85, 0x0f85, 1}, {0x104a, 0x104f, 1}, /* Myanmar punctuation */ {0x10fb, 0x10fb, 1}, /* Georgian punctuation */ {0x1361, 0x1368, 1}, /* Ethiopic punctuation */ {0x166d, 0x166e, 1}, /* Canadian Syl. punctuation */ {0x1680, 0x1680, 0}, {0x169b, 0x169c, 1}, {0x16eb, 0x16ed, 1}, {0x1735, 0x1736, 1}, {0x17d4, 0x17dc, 1}, /* Khmer punctuation */ {0x1800, 0x180a, 1}, /* Mongolian punctuation */ {0x2000, 0x200b, 0}, /* spaces */ {0x200c, 0x2027, 1}, /* punctuation and symbols */ {0x2028, 0x2029, 0}, {0x202a, 0x202e, 1}, /* punctuation and symbols */ {0x202f, 0x202f, 0}, {0x2030, 0x205e, 1}, /* punctuation and symbols */ {0x205f, 0x205f, 0}, {0x2060, 0x27ff, 1}, /* punctuation and symbols */ {0x2070, 0x207f, 0x2070}, /* superscript */ {0x2080, 0x2094, 0x2080}, /* subscript */ {0x20a0, 0x27ff, 1}, /* all kinds of symbols */ {0x2800, 0x28ff, 0x2800}, /* braille */ {0x2900, 0x2998, 1}, /* arrows, brackets, etc. */ {0x29d8, 0x29db, 1}, {0x29fc, 0x29fd, 1}, {0x2e00, 0x2e7f, 1}, /* supplemental punctuation */ {0x3000, 0x3000, 0}, /* ideographic space */ {0x3001, 0x3020, 1}, /* ideographic punctuation */ {0x3030, 0x3030, 1}, {0x303d, 0x303d, 1}, {0x3040, 0x309f, 0x3040}, /* Hiragana */ {0x30a0, 0x30ff, 0x30a0}, /* Katakana */ {0x3300, 0x9fff, 0x4e00}, /* CJK Ideographs */ {0xac00, 0xd7a3, 0xac00}, /* Hangul Syllables */ {0xf900, 0xfaff, 0x4e00}, /* CJK Ideographs */ {0xfd3e, 0xfd3f, 1}, {0xfe30, 0xfe6b, 1}, /* punctuation forms */ {0xff00, 0xff0f, 1}, /* half/fullwidth ASCII */ {0xff1a, 0xff20, 1}, /* half/fullwidth ASCII */ {0xff3b, 0xff40, 1}, /* half/fullwidth ASCII */ {0xff5b, 0xff65, 1}, /* half/fullwidth ASCII */ {0x20000, 0x2a6df, 0x4e00}, /* CJK Ideographs */ {0x2a700, 0x2b73f, 0x4e00}, /* CJK Ideographs */ {0x2b740, 0x2b81f, 0x4e00}, /* CJK Ideographs */ {0x2f800, 0x2fa1f, 0x4e00}, /* CJK Ideographs */ } var emoji = []struct { first rune last rune }{ {0x203c, 0x203c}, {0x2049, 0x2049}, {0x2122, 0x2122}, {0x2139, 0x2139}, {0x2194, 0x2199}, {0x21a9, 0x21aa}, {0x231a, 0x231b}, {0x2328, 0x2328}, {0x23cf, 0x23cf}, {0x23e9, 0x23f3}, {0x24c2, 0x24c2}, {0x25aa, 0x25ab}, {0x25b6, 0x25b6}, {0x25c0, 0x25c0}, {0x25fb, 0x25fe}, {0x2600, 0x2604}, {0x260e, 0x260e}, {0x2611, 0x2611}, {0x2614, 0x2615}, {0x2618, 0x2618}, {0x261d, 0x261d}, {0x2620, 0x2620}, {0x2622, 0x2623}, {0x2626, 0x2626}, {0x262a, 0x262a}, {0x262e, 0x262f}, {0x2638, 0x263a}, {0x2648, 0x2653}, {0x2660, 0x2660}, {0x2663, 0x2663}, {0x2665, 0x2666}, {0x2668, 0x2668}, {0x267b, 0x267b}, {0x267f, 0x267f}, {0x2692, 0x2694}, {0x2696, 0x2697}, {0x2699, 0x2699}, {0x269b, 0x269c}, {0x26a0, 0x26a1}, {0x26aa, 0x26ab}, {0x26b0, 0x26b1}, {0x26bd, 0x26be}, {0x26c4, 0x26c5}, {0x26c8, 0x26c8}, {0x26ce, 0x26cf}, {0x26d1, 0x26d1}, {0x26d3, 0x26d4}, {0x26e9, 0x26ea}, {0x26f0, 0x26f5}, {0x26f7, 0x26fa}, {0x26fd, 0x26fd}, {0x2702, 0x2702}, {0x2705, 0x2705}, {0x2708, 0x270d}, {0x270f, 0x270f}, {0x2712, 0x2712}, {0x2714, 0x2714}, {0x2716, 0x2716}, {0x271d, 0x271d}, {0x2721, 0x2721}, {0x2728, 0x2728}, {0x2733, 0x2734}, {0x2744, 0x2744}, {0x2747, 0x2747}, {0x274c, 0x274c}, {0x274e, 0x274e}, {0x2753, 0x2755}, {0x2757, 0x2757}, {0x2763, 0x2764}, {0x2795, 0x2797}, {0x27a1, 0x27a1}, {0x27b0, 0x27b0}, {0x27bf, 0x27bf}, {0x2934, 0x2935}, {0x2b05, 0x2b07}, {0x2b1b, 0x2b1c}, {0x2b50, 0x2b50}, {0x2b55, 0x2b55}, {0x3030, 0x3030}, {0x303d, 0x303d}, {0x3297, 0x3297}, {0x3299, 0x3299}, {0x1f004, 0x1f004}, {0x1f0cf, 0x1f0cf}, {0x1f170, 0x1f171}, {0x1f17e, 0x1f17f}, {0x1f18e, 0x1f18e}, {0x1f191, 0x1f19a}, {0x1f1e6, 0x1f1ff}, {0x1f201, 0x1f202}, {0x1f21a, 0x1f21a}, {0x1f22f, 0x1f22f}, {0x1f232, 0x1f23a}, {0x1f250, 0x1f251}, {0x1f300, 0x1f320}, {0x1f330, 0x1f335}, {0x1f337, 0x1f37c}, {0x1f380, 0x1f393}, {0x1f3a0, 0x1f3c4}, {0x1f3c6, 0x1f3ca}, {0x1f3e0, 0x1f3f0}, {0x1f400, 0x1f43e}, {0x1f440, 0x1f440}, {0x1f442, 0x1f4f7}, {0x1f4f9, 0x1f4fc}, {0x1f500, 0x1f53d}, {0x1f550, 0x1f567}, {0x1f5fb, 0x1f640}, {0x1f645, 0x1f64f}, {0x1f680, 0x1f6c5}, } func Is(r rune) Class { if r < 0x100 { if r == ' ' || r == '\t' || r == 0 || r == 0xa0 { return 0 // blank } if !unicode.IsPunct(r) { return 2 // word } return 1 // punctation } for _, clazz := range classes { if clazz.first < r && r < clazz.last { return Class(clazz.value) } } for _, v := range emoji { if v.first < r && r < v.last { return 3 // emoji } } return 2 // word } func Split(s string) []string { scan := bufio.NewScanner(strings.NewReader(s)) scan.Split(SplitClass) var words []string for scan.Scan() { words = append(words, scan.Text()) } return words } func SplitClass(data []byte, atEOF bool) (int, []byte, error) { if atEOF && len(data) == 0 { return 0, nil, nil } bpos := 0 b := data last := Invalid for { r, i := utf8.DecodeRune(b) if i == 0 { break } clazz := Is(r) if last == -1 { last = clazz } else if clazz != last { last = clazz break } bpos += i b = b[i:] } if !atEOF && !utf8.FullRune(b) { return 0, nil, nil } return bpos, data[:bpos], nil } go-unicodeclass-0.0.1/unicodeclass_test.go000066400000000000000000000021561354456214100206420ustar00rootroot00000000000000package unicodeclass import ( "bufio" "reflect" "strings" "testing" ) func TestIs(t *testing.T) { tests := []struct { r rune want Class }{ {'1', Word}, {'a', Word}, {'世', CJKIdeographs}, {',', Punctation}, {' ', Blank}, } for _, test := range tests { got := Is(test.r) if got != test.want { t.Fatalf("want %v but %v: %q(%d)", test.want, got, string(test.r), test.r) } } } func TestUnicodeSplit(t *testing.T) { scan := bufio.NewScanner(strings.NewReader("本日は晴天なり")) scan.Split(SplitClass) var got []string for scan.Scan() { got = append(got, scan.Text()) } want := []string{"本日", "は", "晴天", "なり"} if !reflect.DeepEqual(got, want) { t.Fatalf("want %v but %v", want, got) } } func TestSplit(t *testing.T) { tests := []struct { s string want []string }{ {"本日は晴天なり", []string{"本日", "は", "晴天", "なり"}}, {"佐藤B作", []string{"佐藤", "B", "作"}}, } for _, test := range tests { got := Split(test.s) if !reflect.DeepEqual(got, test.want) { t.Fatalf("want %v but %v for %q", test.want, got, test.s) } } }