Browse Source

Added a new file

Rasoul Kaljahi 5 years ago
parent
commit
1ea03f0198
1 changed files with 89 additions and 0 deletions
  1. 89 0
      lib/nlp.py.nonbrk.en

+ 89 - 0
lib/nlp.py.nonbrk.en

@@ -0,0 +1,89 @@
+## List of non-breaking tokens containing a period, for English
+##	20-Sep-2012
+##
+## Adapted from Moses tokenizer
+##
+
+# titles
+
+Adj.
+Adm.
+Adv.
+Asst.
+Bart.
+Bldg.
+Brig.
+Bros.
+Capt.
+Cmdr.
+Col.
+Comdr.
+Con.
+Corp.
+Cpl.
+DR.
+#Dr.											handled by ignoring case
+Drs.
+Ens.
+Gen.
+Gov.
+Hon.
+Hr.
+Hosp.
+Insp.
+Lt.
+MM.
+MR.
+MRS.
+MS.
+Maj.
+Messrs.
+Mlle.
+Mme.
+#Mr.											handled by ignoring case
+#Mrs.											handled by ignoring case
+#Ms.											handled by ignoring case
+Msgr.
+Op.
+Ord.
+Pfc.
+Ph.
+Prof.
+Pvt.
+Rep.
+Reps.
+Res.
+Rev.
+Rt.
+Sen.
+Sens.
+Sfc.
+Sgt.
+Sr.
+St.
+Supt.
+Surg.
+# added
+U.S
+U.S.
+U.S.A
+U.S.A.
+
+
+
+# Misc
+
+v.
+vs.
+i.e.
+rev.
+e.g.
+# assuming that the final period in p.m. and a.m. is never sentence-ending punctuation
+p.m.
+a.m.
+
+# Added from Symantec data
+
+Ver.
+Misc.
+etc.