@@ -609,7 +609,7 @@ DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9
609
609
/* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
610
610
/* |\( ?{NUMBER} ?\)) # is for pound signs */
611
611
FILENAME_EXT = 3gp| avi| bat| bmp| bz2| c| class| cgi| cpp| dll| doc| docx| exe| flv| gif| gz| h| hei[ cf] | htm| html| jar| java| jpeg| jpg| mov| mp[ 34g] | mpeg| o| pdf| php| pl| png| ppt| ps| py| sql| tar| txt| wav| x| xml| zip| wm[ va]
612
- FILENAME = [\p{Alpha}\p{Digit}] +( [- . _/#][\p{Alpha}\p{Digit}] +)* \. {FILENAME_EXT}
612
+ FILENAME = [\p{Alpha}\p{Digit}] +( [- ~.! _/#][\p{Alpha}\p{Digit}] +)* \. {FILENAME_EXT}
613
613
/* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
614
614
/* Go with just the top 20 currencies. */
615
615
SEP_CURRENCY = ( USD| EUR| JPY| GBP| AUD| CAD| CHF| CNY| SEK| NZD| MXN| SGD| HKD| NOK| KRW| TRY| RUB| INR| BRL| ZAR)
@@ -666,8 +666,8 @@ SREDAUX = n{APOSETCETERA}t
666
666
/* Note that Jflex doesn't support {2,} form. Only {2,k}. */
667
667
/* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
668
668
/* Rest are for French borrowings. n allows n'ts in "don'ts" */
669
- /* Arguably, c'mon should be split to "c'm" + "on", but not yet. */
670
- APOWORD = {APOS} n{APOS} ?| [ lLdDjJ] {APOS} | Dunkin{APOS} | somethin{APOS} | ol{APOS} | {APOS} em| diff{APOSETCETERA} rent| [ A- HJ- XZn] {APOSETCETERA} [:letter:]{2}[:letter:]*| {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s| {APOS} till?|[:letter:][:letter:]* [ aeiouyAEIOUY] {APOSETCETERA} [ aeioulA- Z] [:letter:]*| {APOS} cause| cont'd\. ?| nor'easter| c'mon| e'er| s'mores| ev'ry| li'l| nat'l| ass't| O{APOSETCETERA} o
669
+ /* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
670
+ APOWORD = {APOS} n{APOS} ?| [ lLdDjJ] {APOS} | Dunkin{APOS} | somethin{APOS} | ol{APOS} | {APOS} em| diff{APOSETCETERA} rent| [ A- HJ- XZn] {APOSETCETERA} [:letter:]{2}[:letter:]*| {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s| {APOS} till?|[:letter:][:letter:]* [ aeiouyAEIOUY] {APOSETCETERA} [ aeioulA- Z] [:letter:]*| {APOS} cause| cont'd\. ?| nor'easter| c'mon| e'er| s'mores| ev'ry| li'l| nat'l| ass't| 'twixt | O{APOSETCETERA} o
671
671
APOWORD2 = y{APOS}
672
672
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
673
673
FULLURL = ( ftp| svn| svn\+ ssh| http| https| mailto) :\/\/ [^ \t\n\f\r <>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + [^ \t\n\f\r <>|.!?¡¿,·;:&`\"\'\* \p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
@@ -709,12 +709,13 @@ ABNUM = tel|est|ext|sq
709
709
is now caseless. We don't want to have it recognized for P. Both
710
710
p. and P. are now under ABBREV4. ABLIST also went away as no-op [a-e].
711
711
Dr. Sci. is a degree some places. */
712
- ABPTIT = Jr| Sr| Bros|( Ed| Ph) \. D| B \. Sc| LL\. [ BM ] | Esq| Sci
712
+ ABPTIT = Jr| Sr| Bros|( Ed| Ph) \. D| [ BDM ] \. Sc| LL\. [ BDM ] | Esq| Sci
713
713
/* ss?p and aff are for bio taxonomy; also gen and cf but appear elsewhere as ABBREV4 already; fl for flourished. var for variety */
714
714
ABTAXONOMY = ( s( ub)?)? spp?| aff| [ f][ l] | var
715
715
/* Notes: many misspell etc. ect.; kr. is some other currency. eg. for e.g. */
716
- /* Tech would be useful for Indian B. Tech. degrees, but "tech" is used too much as a word. Avg = average*/
717
- ABVARIA = etc| ect| al| seq| Bldg| Pls| wrt| orig| incl| t[ b] ? [ s][ p] | kr| eg| Avg
716
+ /* Tech would be useful for Indian B. Tech. degrees, but "tech" is used too much as a word. Avg = average; pl. for plural */
717
+ /* Cir. for circuit court; lb for pounds */
718
+ ABVARIA = etc| ect| al| seq| Bldg| Pls| wrt| orig| incl| t[ b] ? [ s][ p] | kr| eg| Avg| pl| Cir| min| lb
718
719
719
720
/* ABBREV1 abbreviations are normally followed by lower case words.
720
721
* If they're followed by an uppercase one, we assume there is also a sentence boundary.
@@ -729,10 +730,11 @@ ACRO2 = [A-Za-z](\.[A-Za-z])+|(Canada|Sino|Korean|EU|Japan|non)-U\.S|U\.S\.-(U\.
729
730
/* ABTITLE is mainly person titles, but also Mt for mountains and Ft for Fort. St[ae] does Saint, Santa, suite, etc. */
730
731
/* "Rt." occurs both in "Rt. Rev." (capitalized following) and in abbreviation at end of Hungarian company (lower follows). */
731
732
/* Added "Amb" for Ambassador. Don't have "Ambs" as occurs as family name. Fr. for Friar */
732
- ABTITLE = Mr| Mrs| Ms| Mx| [ M] iss| Drs?| Profs?| Sens?| Reps?| Attys?| Lt| Col| Gen| Messrs| Govs?| Adm| Rev| Fr| Rt| Maj| Sgt| Cpl| Pvt| Capt| St[ ae] ?| Ave| Pres| Lieut| Rt| Hon| Brig| Co? mdr| Pfc| Spc| Supts?| Det| Mt| Ft| Adj| Adv| Asst| Assoc| Ens| Insp| Mlle| Mme| Msgr| Sfc| Amb
733
+ /* Smt. and Ven. before Indian names; Br for brother; Eng. for engineer (but is occasional Chinese name) */
734
+ ABTITLE = Mr| Mrs| Ms| Mx| [ M] iss| Drs?| Profs?| Sens?| Reps?| Attys?| Lt| Col| Gen| Messrs| Govs?| Adm| Rev| Fr| Rt| Maj| Sgt| Cpl| Pvt| Capt| St[ ae] ?| Ave| Pres| Lieut| Rt| Hon| Brig| Co? mdr| Pfc| Spc| Supts?| Det| Mt| Ft| Adj| Adv| Asst| Assoc| Ens| Insp| Mlle| Mme| Msgr| Sfc| Amb| Smt| Ven| Br| Eng
733
735
/* Exhs?. is used for law case exhibits. ass't = assistant, Govt = Government.
734
- Ph is in there for Ph. D Sc for B.Sc.*/
735
- ABCOMP2 = Invt| Elec| Natl| M[ ft] g| Dept| Blvd| Rd| Ave| [ P][ l] | viz| Exhs?| ass't| Govt| vs| [ v] | Wm| Jos| Cie| a\. k\. a| cf| TREAS| Ph| [ S][ c]
736
+ Ph is in there for Ph. D; Sc for B.Sc.; syn. for biology synonym; def. for defeated; Mk for Mark (like tank); Soc. for society */
737
+ ABCOMP2 = Invt| Elec| Natl| M[ ft] g| Dept| Blvd| Rd| Ave| [ P][ l] | viz| Exhs?| ass't| Govt| vs| [ v] | Wm| Jos| Cie| a\. k\. a| cf| TREAS| Ph| [ S][ c] | syn | def | Mk | Soc
736
738
737
739
/* ABBREV2 abbreviations are normally followed by an upper case word.
738
740
* We assume they aren't used sentence finally.
@@ -745,15 +747,14 @@ ACRONYM = ({ACRO})\.
745
747
/* In tables: Mkt. for market Div. for division of company, Chg., Yr.: year */
746
748
747
749
/* --- ABBREV3 abbreviations are allowed only before numbers. ---
748
- * Otherwise, they aren't recognized as abbreviations (unless they also
749
- * appear in ABBREV1 or ABBREV2).
750
+ * Otherwise, they aren't recognized as abbreviations (unless they also appear in ABBREV1 or ABBREV2).
750
751
* est. is "estimated" -- common in some financial contexts. ext. is extension, ca. is circa.
751
752
* "Art(s)." is for "article(s)" -- common in legal context, Sec(t). for section(s). ch for chapters.
752
753
* res for resolution (of Congress etc.)
753
754
*/
754
755
/* Maybe also "op." for "op. cit." but also get a photo op. Rs. for Rupees */
755
756
/* Pt for part needs to be case sensitive (vs. country code for Portugal). */
756
- ABBREV3 = ( ca| chs?| figs?| prop| nos?| vols?| sect? s?| arts?| paras?| bldg| prop| pp| op| approx| [ P ] [ t] | rs| Apt| Rt| Res) \.
757
+ ABBREV3 = ( ca| chs?| figs?| prop| nos?| nrs ?| vols?| sect? s?| arts?| paras?| bldg| prop| pp| op| approx| p [ t] | rs| Apt| Rt| Res) \.
757
758
/* Case for south/north before a few places. */
758
759
ABBREVSN = So\. | No\.
759
760
@@ -776,7 +777,8 @@ UNDS = _+
776
777
ASTS = \* +|( \\\* ){1,3}
777
778
HASHES = #+
778
779
FNMARKS = {ATS} | {HASHES} | {UNDS}
779
- INSENTP = [ ,;:\u3001 ]
780
+ /* U+3001 is Chinese dunhao comma; U+0F0D is Tibetan shad */
781
+ INSENTP = [ ,;:\u3001\u0F0D ]
780
782
QUOTES = {APOS} | [ `\u2018 - \u201F\u0082\u0084\u0091 - \u0094\u2039\u203A\u00AB\u00BB ] {1,2}
781
783
DBLQUOT = \" | "| [ `'\u0091\u0092\u2018\u2019 ] '
782
784
/* Cap'n for captain, c'est for french */
@@ -790,17 +792,18 @@ BANGMAGAZINES = OK\!
790
792
SMILEY = [ <>] ? [ :;=][ \- o\* '] ? [ \(\) DPdpO\\ {@\|\[\] ]
791
793
ASIANSMILEY = [ \^ x=~<>] \.\[\^ x =~ <>]| [ \-\^ x=~<>'] _[ \-\^ x=~<>'] | \( [ \-\^ x=~<>'][ _.] ? [ \-\^ x=~<>'] \) | \( [ \^ x=~<>'] -[ \^ x=~<>'`] \) | ¯\\ _\( ツ\) _\/ ¯
792
794
793
- /* Slightly generous but generally reasonable emoji parsing */
795
+ /* Slightly generous but generally reasonably good emoji parsing */
794
796
/* These are human emoji that can have a zwj gender (as well as skin color) */
795
797
EMOJI_GENDERED = [ \u26F9\u{01F3C3} - \u{01F3C4}\u{01F3CA} - \u{01F3CC}\u{01F466} - \u{01F469}\u{01F46E} - \u{01F46F}\u{01F471}\u{01F473}\u{01F477}\u{01F481} - \u{01F482}\u{01F486} - \u{01F487}\u{01F575}\u{01F645} - \u{01F647}\u{01F64B}\u{01F64D} - \u{01F64E}\u{01F6A3}\u{01F6B4} - \u{01F6B6}\u{01F926}\u{01F937} - \u{01F939}\u{01F93C} - \u{01F93E}\u{01F9D6} - \u{01F9DF} ]
796
- /* Emoji follower is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
798
+ /* EMOJI_FOLLOW is a variation selector (emoji/non-emoji rendering) or a Fitzpatrick skin tone modifier */
797
799
EMOJI_FOLLOW = [ \uFE0E\uFE0F\u{01F3FB} - \u{01F3FF} ]
798
800
/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
799
801
EMOJI_KEYCAPS = [ \u0023\u002A\u0030 - \u0039 ] \uFE0F ? \u20E3
800
802
/* Two geographic characters as a flag or GB regions as flags
801
803
* (changed to use \U to avoid bug in IntelliJ JFlex plugin).
804
+ * The last disjunct is emoji tag sequence (ETS) support for certain additional flags
802
805
*/
803
- EMOJI_FLAG = [ \U01F1E6 - \U01F1FF ] {2,2}| \U01F3F4\U0E0067\U0E0062 [ \U0E0061 - \U0E007A ] + \U0E007F
806
+ EMOJI_FLAG = [ \U01F1E6 - \U01F1FF ] {2,2}| \U01F3F4\U0E0067\U0E0062 [ \U0E0061 - \U0E007A ] + \U0E007F | \U01F3F4 [ \u{E0030} - \u{E0039}\u{E0061} - \u{E007A} ] + \U0E007F
804
807
/* Rainbow flag etc. */
805
808
EMOJI_MISC = [ \u{01F3F3}\u{01F441} ][ \uFE0E\uFE0F ] ? \u200D [ \u{01F308}\u{01F5E8} ][ \uFE0E\uFE0F ] ?| {EMOJI_KEYCAPS}
806
809
/* Things that have an emoji presentation form */
@@ -811,11 +814,13 @@ HUMAN_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2764\u{01F33E}\u{01F373}\u{01
811
814
EMOJI = {EMOJI_FLAG} | {EMOJI_PRESENTATION}{EMOJI_FOLLOW} ?| {EMOJI_GENDERED}{EMOJI_FOLLOW} ?( \u200D ( [ \u{01F466} - \u{01F469} ] {EMOJI_FOLLOW} ?| {HUMAN_MODIFIER} )){1,3}| {EMOJI_MISC}
812
815
813
816
/* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
814
- MISCSYMBOL = [ +%&~\^ |\\ ¦\u00A7 ¨\u00A9\u00AC\u00AE ¯\u00B0 - \u00B3\u00B4 - \u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600 - \u0603\u0606 - \u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703 - \u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u1FBD\u2016\u2017\u2020 - \u2025\u2030 - \u2038\u203B\u203C\u2043\u203E - \u2042\u2044\u207A - \u207F\u208A - \u208E\u2100 - \u214F\u2190 - \u21FF\u2200 - \u2BFF\u3001 - \u3006\u3008 - \u3020\u30FB\uFF01 - \uFF0F\uFF1A - \uFF20\uFF3B - \uFF40\uFF5B - \uFF65\uFF65 ]
815
817
/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
816
- /* Math and other symbols that stand alone: °²× ∀ */
818
+ /* Math and other symbols that stand alone: °²× ∀; \u33A1 is m^2 in one char! */
819
+ /* Tibetan tsheg or tsek (U+0F0B) goes between syllables; words aren't space separated, so it may be a word or syllable marker; it indicates a possible line-break point. Treat as separate symbol. */
820
+ MISCSYMBOL = [ +%&~\^ |\\ ¦\u00A7 ¨\u00A9\u00AC\u00AE ¯\u00B0 - \u00B3\u00B4 - \u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600 - \u0603\u0606 - \u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703 - \u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u0F0B\u1FBD\u2016\u2017\u2020 - \u2025\u2030 - \u2038\u203B\u203C\u2043\u203E - \u2042\u2044\u207A - \u207F\u208A - \u208E\u2100 - \u214F\u2190 - \u21FF\u2200 - \u2BFF\u3001 - \u3006\u3008 - \u3020\u30FB\u33A1\uFF01 - \uFF0F\uFF1A - \uFF20\uFF3B - \uFF40\uFF5B - \uFF65\uFF65 ]
817
821
818
822
PROG_LANGS = c[ +][ +] |( c| f) #
823
+ /* Assimilations3 leave 3 chars behind after division */
819
824
ASSIMILATIONS3 = cannot| 'twas| dunno| [ '’] d[ '’] ve
820
825
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
821
826
/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
@@ -1068,6 +1073,8 @@ RM/{NUM} { String txt = yytext();
1068
1073
<YyTokenizePerLine> {ABBREVSN} / {SPACE} +( Africa| Korea| Cal) { return getNext(); }
1069
1074
/* Special case to get pty. ltd. or pty limited. Also added "Co." since someone complained, but usually a comma after it. */
1070
1075
( pty| pte| pvt| co) \. / {SPACE} ( ltd| lim| llc) { return getNext(); }
1076
+ /* Special case to get op. cit. or loc. cit. */
1077
+ ( op| loc) \. / {SPACE} cit\. { return getNext(); }
1071
1078
<YyNotTokenizePerLine> {ABBREV1} / {SENTEND1} {
1072
1079
return processAbbrev1();
1073
1080
}
0 commit comments