|
@@ -0,0 +1,89 @@
|
|
|
+## List of non-breaking tokens containing a period, for English
|
|
|
+## 20-Sep-2012
|
|
|
+##
|
|
|
+## Adapted from Moses tokenizer
|
|
|
+##
|
|
|
+
|
|
|
+# titles
|
|
|
+
|
|
|
+Adj.
|
|
|
+Adm.
|
|
|
+Adv.
|
|
|
+Asst.
|
|
|
+Bart.
|
|
|
+Bldg.
|
|
|
+Brig.
|
|
|
+Bros.
|
|
|
+Capt.
|
|
|
+Cmdr.
|
|
|
+Col.
|
|
|
+Comdr.
|
|
|
+Con.
|
|
|
+Corp.
|
|
|
+Cpl.
|
|
|
+DR.
|
|
|
+#Dr. handled by ignoring case
|
|
|
+Drs.
|
|
|
+Ens.
|
|
|
+Gen.
|
|
|
+Gov.
|
|
|
+Hon.
|
|
|
+Hr.
|
|
|
+Hosp.
|
|
|
+Insp.
|
|
|
+Lt.
|
|
|
+MM.
|
|
|
+MR.
|
|
|
+MRS.
|
|
|
+MS.
|
|
|
+Maj.
|
|
|
+Messrs.
|
|
|
+Mlle.
|
|
|
+Mme.
|
|
|
+#Mr. handled by ignoring case
|
|
|
+#Mrs. handled by ignoring case
|
|
|
+#Ms. handled by ignoring case
|
|
|
+Msgr.
|
|
|
+Op.
|
|
|
+Ord.
|
|
|
+Pfc.
|
|
|
+Ph.
|
|
|
+Prof.
|
|
|
+Pvt.
|
|
|
+Rep.
|
|
|
+Reps.
|
|
|
+Res.
|
|
|
+Rev.
|
|
|
+Rt.
|
|
|
+Sen.
|
|
|
+Sens.
|
|
|
+Sfc.
|
|
|
+Sgt.
|
|
|
+Sr.
|
|
|
+St.
|
|
|
+Supt.
|
|
|
+Surg.
|
|
|
+# added
|
|
|
+U.S
|
|
|
+U.S.
|
|
|
+U.S.A
|
|
|
+U.S.A.
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# Misc
|
|
|
+
|
|
|
+v.
|
|
|
+vs.
|
|
|
+i.e.
|
|
|
+rev.
|
|
|
+e.g.
|
|
|
+# assuming that the last period in p.m. and a.m. is never sentence-ending punctuation
|
|
|
+p.m.
|
|
|
+a.m.
|
|
|
+
|
|
|
+# Added from Symantec data
|
|
|
+
|
|
|
+Ver.
|
|
|
+Misc.
|
|
|
+etc.
|