#! /usr/bin/python
# coding=UTF-8
## mask1.py
## This script masks specific patterns in the input string.
##
## Current version: 1.0
##
import optparse
import re
import sys
  8. ##======================================================================
  9. ## main
  10. def main(argv=None):
  11. if argv is None:
  12. arv = sys.argv
  13. parser = optparse.OptionParser(usage="%prog [OPTIONS]" +
  14. "\nMasks specific patterns in the input sentence.", version="%prog 1.0")
  15. parser.add_option("-a", "--all", help="mask every implemented pattern", dest="maskAll", action="store_true")
  16. ############### New
  17. parser.add_option("-g", "--lbegin", help="clean beginning of line", dest="maskLBegin", action="store_true")
  18. parser.add_option("-s", "--specialstr", help="mask special strings (e.g. C4:8A:1D:)", dest="maskSpecialStr", action="store_true")
  19. ############### New
  20. parser.add_option("-o", "--memaddr", help="mask memory addresses", dest="maskMemAddr", action="store_true")
  21. parser.add_option("-r", "--regkey", help="mask windows registry key", dest="maskRegKey", action="store_true")
  22. parser.add_option("-p", "--pholder", help="mask placeholders", dest="maskPHolder", action="store_true")
  23. parser.add_option("-l", "--listnum", help="mask list numbers", dest="maskListNum", action="store_true")
  24. parser.add_option("-m", "--menuitem", help="mask menu item", dest="maskMenuItem", action="store_true")
  25. parser.add_option("-t", "--tag", help="mask HTML/XML tags", dest="maskTag", action="store_true")
  26. parser.add_option("-v", "--var", help="mask variables", dest="maskVar", action="store_true")
  27. parser.add_option("-b", "--bslash", help="replace \n \r \t with white space", dest="maskBSlash", action="store_true")
  28. parser.add_option("-e", "--email", help="mask Email", dest="maskEmail", action="store_true")
  29. parser.add_option("-u", "--url", help="mask URL", dest="maskURL", action="store_true")
  30. parser.add_option("-w", "--winpath", help="mask Windows file path", dest="maskWPath", action="store_true")
  31. parser.add_option("-x", "--unixpath", help="mask Unix file path", dest="maskUPath", action="store_true")
  32. parser.add_option("-i", "--ipverno", help="mask IP address and version number", dest="maskIPV", action="store_true")
  33. parser.add_option("-d", "--datetime", help="mask date/time", dest="maskDateTime", action="store_true")
  34. ############### New
  35. parser.add_option("-n", "--number", help="mask numbers", dest="maskNum", action="store_true")
  36. parser.add_option("--ab", help="mask angle brackets", dest="maskABracket", action="store_true")
  37. parser.add_option("--rb", help="mask round brackets", dest="maskRBracket", action="store_true")
  38. parser.add_option("--sb", help="mask square brackets", dest="maskSBracket", action="store_true")
  39. parser.add_option("--useless", help="mask useless lines", dest="emptyUseless", action="store_true")
  40. ### add pattern options here
  41. (opts, posArgs) = parser.parse_args()
  42. # processing options
  43. ## overriding individual pattern options if --all is set
  44. ### remember to add new feature options here
  45. if opts.maskAll:
  46. ##################### New
  47. opts.maskNum = True
  48. opts.maskSpecialStr = True
  49. ##################### New
  50. opts.maskMemAddr = True
  51. opts.maskRegKey = True
  52. opts.maskPHolder = True
  53. opts.maskListNum = True
  54. opts.maskMenuItem = True
  55. opts.maskTag = True
  56. opts.maskVar = True
  57. opts.maskBSlash = True
  58. opts.maskEmail = True
  59. opts.maskURL = True
  60. opts.maskWPath = True
  61. opts.maskUPath = True
  62. opts.maskIPV = True
  63. opts.maskDateTime = True
  64. ##################### New
  65. opts.maskLBegin = True
  66. opts.maskABracket = True
  67. opts.maskRBracket = True
  68. opts.maskSBracket = True
  69. # variables to keep track of the number of replacement for each pattern.
  70. ###################### New
  71. vTotalLBegins = 0
  72. vLBeginLines = 0
  73. vTotalSStrs = 0
  74. vSStrLines = 0
  75. ###################### New
  76. vTotalMemAddrs = 0
  77. vMemAddrLines = 0
  78. vTotalRegKeys = 0
  79. vRegKeyLines = 0
  80. vTotalPHolders = 0
  81. vPHolderLines = 0
  82. vTotalLNums = 0
  83. vLNumLines = 0
  84. vTotalMItems = 0
  85. vMItemLines = 0
  86. vTotalTags = 0
  87. vTagLines = 0
  88. vTotalVars = 0
  89. vVarLines = 0
  90. vTotalBSlashs = 0
  91. vBSlashLines = 0
  92. vTotalEmails = 0
  93. vEmailLines = 0
  94. vTotalURLs = 0
  95. vURLLines = 0
  96. vTotalWPaths = 0
  97. vWPathLines = 0
  98. vTotalUPaths = 0
  99. vUPathLines = 0
  100. vTotalIPVs = 0
  101. vIPVLines = 0
  102. vTotalDTs = 0
  103. vDTLines = 0
  104. ###################### New
  105. vTotalNums = 0
  106. vNumLines = 0
  107. vTotalABrackets = 0
  108. vABracketLines = 0
  109. vTotalRBrackets = 0
  110. vRBracketLines = 0
  111. vTotalSBrackets = 0
  112. vSBracketLines = 0
  113. vTotalAdjacentMasks = 0
  114. vAMaskLines = 0
  115. vTotalUseless = 0
  116. # reading input string from stdin and processing
  117. ## Note: in cases where the mask is surrounded by a space, the intention
  118. ## is to treat fusion alongside. E.g. follow:http://. -> follow -URL- .
  119. ## Since this adds an unwanted space for non-fused cases, a tokenization
  120. ## or cleaning of trailing spaces is advised.
  121. vLineCntr = 0
  122. while True:
  123. vInput = sys.stdin.readline()
  124. if not vInput.strip():
  125. break
  126. ############################## New
  127. # cleaning line beginnings
  128. if opts.maskLBegin:
  129. vPattern = re.compile(r"^[ \t]*(?:||-|•|·|�|§|©|\*)[ \t]*(?=[A-Za-z\d])")
  130. vInput, vNum1 = vPattern.subn(r"", vInput)
  131. vTotalLBegins += vNum1
  132. if vNum1 > 0:
  133. vLBeginLines += 1
  134. vNum1 = 0
  135. # masking Special strings seen in data
  136. if opts.maskSpecialStr:
  137. # e.g. 60:06:01:60:83:B0:11:00:4A:C4:8A:1D:
  138. vPattern = re.compile(r"(?:(?<=[\s#])|^)(?:[\d\w]{2,4}:){4,}[\d\w]*")
  139. vInput, vNum1 = vPattern.subn(r" -SPS-", vInput)
  140. vTotalSStrs += vNum1
  141. if vNum1 > 0:
  142. vSStrLines += 1
  143. vNum1 = 0
  144. #################### New
  145. # masking adjacent identical masks
  146. vPattern = re.compile(r"-SPS-(?:[ \t]*-SPS-)+")
  147. vInput = vPattern.sub(r"-SPS-", vInput)
  148. # masking memeory addresses
  149. if opts.maskMemAddr:
  150. # e.g. 0x88888888
  151. vPattern = re.compile(r"0x[\d\w]+", re.IGNORECASE)
  152. vInput, vNum1 = vPattern.subn(r"-MEM-", vInput)
  153. vTotalMemAddrs += vNum1
  154. ######################## New
  155. # This pattern was found in already filtered test data
  156. vPattern = re.compile(r"<MEM_ADDR>", re.IGNORECASE)
  157. vInput, vNum2 = vPattern.subn(r"-MEM-", vInput)
  158. vTotalMemAddrs += vNum2
  159. if vNum1 + vNum2 > 0:
  160. vMemAddrLines += 1
  161. vNum1 = 0
  162. vNum2 = 0
  163. # masking adjacent identical masks
  164. vPattern = re.compile(r"-MEM-(?:[ \t]*-SPS-)+")
  165. vInput = vPattern.sub(r"-MEM-", vInput)
  166. # masking windows registry keys
  167. if opts.maskRegKey:
  168. #vPattern = re.compile(r"(^|[ #(\[])HKEY_.*?([ #).\n\]])")
  169. ################# Edited
  170. vPattern = re.compile(r"(?:HKEY_|HKLM).*?((?:[\s#\)\"',\]])|\.[\s])")
  171. vInput, vNum1 = vPattern.subn(r" -WRK- \1", vInput)
  172. vTotalRegKeys += vNum1
  173. if vNum1 > 0:
  174. vRegKeyLines += 1
  175. vNum1 = 0
  176. #################### New
  177. # masking adjacent identical masks
  178. vPattern = re.compile(r"-WRK-(?:[ \t]*-WRK-)+")
  179. vInput = vPattern.sub(r"-WRK-", vInput)
  180. # masking placeholders: \{...\}
  181. # Note: mask after windows registry keys, before windows file path
  182. if opts.maskPHolder:
  183. vPattern = re.compile(r"\\\{.*?\\\}")
  184. vInput, vNum1 = vPattern.subn(r" -PHL- ", vInput)
  185. vTotalPHolders += vNum1
  186. if vNum1 > 0:
  187. vPHolderLines += 1
  188. vNum1 = 0
  189. #################### New
  190. # masking adjacent identical masks
  191. vPattern = re.compile(r"-PHL-(?:[ \t]*-PHL-)+")
  192. vInput = vPattern.sub(r"-PHL-", vInput)
  193. # masking list numbers
  194. # Note: mask before angle-bracket-related patterns
  195. if opts.maskListNum:
  196. # N., N-, N)
  197. #################### Edited
  198. vPattern = re.compile(r"^\s*\d+\s*[\).-](?=\s+[^\d]|$)")
  199. vInput, vNum1 = vPattern.subn(r"-LNM- ", vInput)
  200. vTotalLNums += vNum1
  201. # <N>
  202. vPattern = re.compile(r"^\s*<\s*\d+\s*>")
  203. vInput, vNum2 = vPattern.subn(r"-LNM- ", vInput)
  204. vTotalLNums += vNum2
  205. # A., A-, A)
  206. vPattern = re.compile(r"^\s*\w\s*[\).-](?=\s+[^\d])")
  207. vInput, vNum3 = vPattern.subn(r"-LNM- ", vInput)
  208. vTotalLNums += vNum3
  209. if vNum1 + vNum2 + vNum3> 0:
  210. vLNumLines += 1
  211. vNum1 = 0
  212. vNum2 = 0
  213. vNum3 = 0
  214. #################### New
  215. # masking adjacent identical masks
  216. vPattern = re.compile(r"-LNM-(?:[ \t]*-LNM-)+")
  217. vInput = vPattern.sub(r"-LNM-", vInput)
  218. # masking menu items
  219. # Note: mask before variable
  220. ## For example, in Click < >Reports < >, the menu item will be
  221. ## replaced with "Reports". If needed, the quotation marks will
  222. ## be mask later. In fact, this is not masking but editing.
  223. if opts.maskMenuItem:
  224. vPattern = re.compile(r"<\s*>\s*([^<>]*?)\s*<\s*>")
  225. vInput, vNum1 = vPattern.subn(r'"\1"', vInput)
  226. vTotalMItems += vNum1
  227. if vNum1 > 0:
  228. vMItemLines += 1
  229. vNum1 = 0
  230. # masking HTML/XML tags
  231. # Note: masks the whole tag
  232. # Note: mask after menu items above and before variable
  233. if opts.maskTag:
  234. # HTML-like
  235. vPattern = re.compile(r"<\w+\s*\w+\=.*?[>/].*>")
  236. vInput, vNum1 = vPattern.subn(r" -TAG- ", vInput)
  237. vTotalTags += vNum1
  238. # XML-like
  239. vPattern = re.compile(r"<\w+\:.*?[>/].*>")
  240. vInput, vNum2 = vPattern.subn(r" -TAG- ", vInput)
  241. vTotalTags += vNum2
  242. if vNum1 + vNum2 > 0:
  243. vTagLines += 1
  244. vNum1 = 0
  245. vNum2 = 0
  246. #################### New
  247. # masking adjacent identical masks
  248. vPattern = re.compile(r"-TAG-(?:[ \t]*-TAG-)+")
  249. vInput = vPattern.sub(r"-TAG-", vInput)
  250. # masking variables
  251. # Note: mask after menu items above
  252. if opts.maskVar:
  253. # e.g. <token >AssignOrgGroupTask < >
  254. vPattern = re.compile(r"<[\w-]+\s*>\s*[^<>]*?\s*<\s*>")
  255. vInput, vNum1 = vPattern.subn(r" -VAR- ", vInput)
  256. vTotalVars += vNum1
  257. # fusions like [..]X...
  258. # Note: the masked pattern could even be removed!
  259. vPattern = re.compile(r"(?:\s|^)\[[^\]]*?\]([A-Z])")
  260. vInput, vNum2 = vPattern.subn(r"-VAR- \1", vInput)
  261. vTotalVars += vNum2
  262. # [..], <..>
  263. vPattern = re.compile(r"[<\[].*?[>\]]")
  264. vInput, vNum3 = vPattern.subn(r"-VAR-", vInput)
  265. vTotalVars += vNum3
  266. if vNum1 + vNum2 + vNum3 > 0:
  267. vVarLines += 1
  268. vNum1 = 0
  269. vNum2 = 0
  270. vNum3 = 0
  271. #################### New
  272. # masking adjacent identical masks
  273. vPattern = re.compile(r"-VAR-(?:[ \t]*-VAR-)+")
  274. vInput = vPattern.sub(r"-VAR-", vInput)
  275. # replacing \n \r \t with whitespace
  276. # Note: mask before windows file path
  277. if opts.maskBSlash:
  278. vPattern = re.compile(r"(?:\s+(?:\\+[ntr])+)+")
  279. vInput, vNum1 = vPattern.subn(r" ", vInput)
  280. vTotalBSlashs += vNum1
  281. vPattern = re.compile(r"(?:([?.,\[\}]+)(?:\\+[ntr])+)+")
  282. vInput, vNum2 = vPattern.subn(r"\1 ", vInput)
  283. vTotalBSlashs += vNum2
  284. vPattern = re.compile(r"(?:\\+[ntr])+$")
  285. vInput, vNum3 = vPattern.subn(r"", vInput)
  286. vTotalBSlashs += vNum3
  287. # this should be done last
  288. vPattern = re.compile(r"(?:\\+[ntr]){2,}")
  289. vInput, vNum4 = vPattern.subn(r" ", vInput)
  290. vTotalBSlashs += vNum4
  291. if vNum1 + vNum2 + vNum3 + vNum4 > 0:
  292. vBSlashLines += 1
  293. vNum1 = 0
  294. vNum2 = 0
  295. vNum3 = 0
  296. vNum4 = 0
  297. # masking Email
  298. # Note: Email should be masked before URL
  299. if opts.maskEmail:
  300. # Note: This pattern masks more than emails.
  301. ################### Edited
  302. vPattern = re.compile(r"\b[^\s@]+@[\w\d].*?([\s,\)]|\.\s)", re.IGNORECASE)
  303. vInput, vNum1 = vPattern.subn(r" -EML-\1", vInput)
  304. vTotalEmails += vNum1
  305. if vNum1 > 0:
  306. vEmailLines += 1
  307. vNum1 = 0
  308. #################### New
  309. # masking adjacent identical masks
  310. vPattern = re.compile(r"-EML-(?:[ \t]*-EML-)+")
  311. vInput = vPattern.sub(r"-EML-", vInput)
  312. # masking URL
  313. if opts.maskURL:
  314. ############################ Edited
  315. vPattern = re.compile(r"(t?ftp:|https?:).*?([\s)\"',\]\}]|\.[\s])", re.IGNORECASE)
  316. vInput, vNum1 = vPattern.subn(r" -URL- \2", vInput)
  317. vTotalURLs += vNum1
  318. vPattern = re.compile(r"[w]{2,}.*?\..*?([\s)\"',\]\}]|\.[\s])", re.IGNORECASE)
  319. vInput, vNum2 = vPattern.subn(r" -URL- \1", vInput)
  320. vTotalURLs += vNum2
  321. vPattern = re.compile(r"\b[\S]{2,}[.](?:com|net|org|co\.uk|ru|ch|de|fr)\b[\S]*?([\s)\"',\]\}]|\.[\s])", re.IGNORECASE)
  322. vInput, vNum3 = vPattern.subn(r" -URL- \1", vInput)
  323. vTotalURLs += vNum3
  324. ######################## New
  325. # This pattern was found in already filtered test data
  326. vPattern = re.compile(r"<(?:url_path|url_path|url_path)>", re.IGNORECASE)
  327. vInput, vNum4 = vPattern.subn(r"-URL-", vInput)
  328. vTotalURLs += vNum4
  329. if vNum1 + vNum2 + vNum3 + vNum4 > 0:
  330. vURLLines += 1
  331. vNum1 = 0
  332. vNum2 = 0
  333. vNum3 = 0
  334. vNum4 = 0
  335. #################### New
  336. # masking adjacent identical masks
  337. vPattern = re.compile(r"-URL-(?:[ \t]*-URL-)+")
  338. vInput = vPattern.sub(r"-URL-", vInput)
  339. # masking windows file paths
  340. if opts.maskWPath:
  341. ## This pattern treats paths containing whitespace. The results
  342. ## will be consumed by the next patterns to fully mask the path.
  343. ## Note that it's not included in counting.
  344. vPattern = re.compile(r"\\[A-Za-z0-9_-]+(?:[\s]+[A-Za-z0-9_-]+){1,3}(?=\\)")
  345. vInput, vNum1 = vPattern.subn(r"\\WFP", vInput)
  346. # mathcing some known whitespace issues not captured above
  347. vPattern = re.compile(r"\\enterprise vault", re.IGNORECASE)
  348. vInput, vNum2 = vPattern.subn(r"\\WFP", vInput)
  349. # this pattern utilizes "" to handle paths containing whitespace
  350. #vPattern = re.compile(r"\s\"[A-Za-z]:[^\s]*?\\+.*?\"")
  351. vPattern = re.compile(r"[\"'][^\s]*?\\+.*?[\"']")
  352. vInput, vNum3 = vPattern.subn(r" -WFP- ", vInput)
  353. vTotalWPaths += vNum3
  354. vPattern = re.compile(r"(?:\b|\\)[\\]*(?:[^\s/]+?\\+)+[^\s/]*\b")
  355. vInput, vNum4 = vPattern.subn(r" -WFP- ", vInput)
  356. vTotalWPaths += vNum4
  357. ## This pattern is meant to match file names. However, it may
  358. ## also match domain names which are missed above in URL masking
  359. ## as well as some fused period. Also, it does not cover any
  360. ## possible file name.
  361. ######################## Edited
  362. #vPattern = re.compile(r"(['\"\(\s])[\w\d-]+\.[\w\d-]+(['\"\)\s])")
  363. vPattern = re.compile(r"(^|[^\w\d.-])[\w\d-]{2,}(?:\.[\w\d-]{2,})+(['\"\)\s.])")
  364. vInput, vNum5 = vPattern.subn(r"\1-WFP-\2", vInput)
  365. vTotalWPaths += vNum5
  366. ######################## New
  367. # This pattern was found in already filtered test data
  368. vPattern = re.compile(r"<winpath>", re.IGNORECASE)
  369. vInput, vNum6 = vPattern.subn(r"-WFP-", vInput)
  370. vTotalWPaths += vNum6
  371. if vNum3 + vNum4 + vNum5 + vNum6 > 0:
  372. vWPathLines += 1
  373. vNum1 = 0
  374. vNum2 = 0
  375. vNum3 = 0
  376. vNum4 = 0
  377. vNum5 = 0
  378. vNum6 = 0
  379. #################### New
  380. # merging masks with various neighbours (e.g. -WFP- /)
  381. vPattern = re.compile(r"-WFP-\s*\\+(?=[\s#)\"',\]\}]|\.[\s])")
  382. vInput = vPattern.sub(r" -WFP- ", vInput)
  383. # masking adjacent identical masks
  384. vPattern = re.compile(r"-WFP-(?:[ \t]*-WFP-)+")
  385. vInput = vPattern.sub(r"-WFP-", vInput)
  386. # masking Unix-like file paths
  387. if opts.maskUPath:
  388. ################### Edited
  389. ## This pattern treats paths containing whitespace. The results
  390. ## will be consumed by the next patterns to fully mask the path.
  391. ## Note that it's not included in counting.
  392. vPattern = re.compile(r"/[A-Za-z0-9_-]{2,}(?:[\s]+[A-Za-z0-9_-]{2,}){1,3}(?=/)")
  393. vInput, vNum1 = vPattern.subn(r"/UFP", vInput)
  394. # this pattern utilizes "" to handle paths containing whitespace
  395. #vPattern = re.compile(r"\s\"[A-Za-z]:[^\s]*?\\+.*?\"")
  396. vPattern = re.compile(r"[\"'][\S]*?/+.*?[\"']")
  397. vInput, vNum2 = vPattern.subn(r" -UFP- ", vInput)
  398. vTotalUPaths += vNum2
  399. #################### Edited
  400. ## This pattern matchs string with more than one / to avoid
  401. ## masking non-path paterns like Usersyyyyyy/Symantec
  402. vPattern = re.compile(r"(?:\b|/)[/]*(?:[^\s/]+?/+){2,}[^\s/]*\b")
  403. vInput, vNum3 = vPattern.subn(r" -UFP- ", vInput)
  404. vTotalUPaths += vNum3
  405. if vNum2 + vNum3 > 0:
  406. vUPathLines += 1
  407. vNum1 = 0
  408. vNum2 = 0
  409. vNum3 = 0
  410. #################### New
  411. # masking adjacent identical masks
  412. vPattern = re.compile(r"-UFP-(?:[ \t]*-UFP-)+")
  413. vInput = vPattern.sub(r"-UFP-", vInput)
  414. # masking IP addresses and version number
  415. ## Note: Since it is not easy to distiguish between IP and version,
  416. ## they are masked together.
  417. if opts.maskIPV:
  418. vPattern = re.compile(r"\b[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}(?::[\d]{1,5})?(?:/[\d]{1,2})?(?=[\D])")
  419. vInput, vNum1 = vPattern.subn(r"-IPV-", vInput)
  420. vTotalIPVs += vNum1
  421. ######################## New
  422. vPattern = re.compile(r"\b[vV]?[\d]{1,2}\.[\d]{1,2}(?:\.[\d]{1,5})+(?=[\D])")
  423. vInput, vNum2 = vPattern.subn(r"-IPV-", vInput)
  424. vTotalIPVs += vNum2
  425. # This pattern was found in already filtered test data
  426. vPattern = re.compile(r"<(?:ip_address|adresse_ip|ip_ver)>", re.IGNORECASE)
  427. vInput, vNum3 = vPattern.subn(r"-IPV-", vInput)
  428. vTotalIPVs += vNum3
  429. if vNum1 + vNum2 + vNum3 > 0:
  430. vIPVLines += 1
  431. vNum1 = 0
  432. vNum2 = 0
  433. vNum3 = 0
  434. #################### New
  435. # masking adjacent identical masks
  436. vPattern = re.compile(r"-IPV-(?:[ \t]*-IPV-)+")
  437. vInput = vPattern.sub(r"-IPV-", vInput)
  438. # masking date/times
  439. ## Note: Date and time are masked separately. They'll be merged
  440. ## later, when all neighboring similar maskes are mereged in a
  441. ## post-processing run.
  442. if opts.maskDateTime:
  443. # times
  444. vPattern = re.compile(r"(?:(?<=[^\d:])|^)[\d]{1,2}:[\d]{2}(?::[\d]{2})?(?:\.[\d]+)?\s*(?:AM|A\.M\.|PM|P\.M\.)?\s*(?:GMT|UTC|CET|EDT|EST|EET|PDT|PST)?(?=[^\d])", re.IGNORECASE)
  445. vInput, vNum1 = vPattern.subn(r"-DTM- ", vInput)
  446. vTotalDTs += vNum1
  447. #vPattern = re.compile(r"\b\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{2,4}")
  448. #vInput, vNum1 = vPattern.subn(r" -DTM- ", vInput)
  449. #vTotalDTs += vNum1
  450. ######################## New
  451. # This pattern was found in already filtered test data
  452. vPattern = re.compile(r"<date_and_time>", re.IGNORECASE)
  453. vInput, vNum9 = vPattern.subn(r"-DTM-", vInput)
  454. vTotalDTs += vNum9
  455. if vNum1 + vNum9 > 0:
  456. vDTLines += 1
  457. vNum1 = 0
  458. vNum9 = 0
  459. #################### New
  460. # masking adjacent identical masks
  461. vPattern = re.compile(r"-DTM-(?:[ \t]*-DTM-)+")
  462. vInput = vPattern.sub(r"-DTM-", vInput)
  463. #################### New
  464. # masking numbers
  465. ## Note: Numbers are masked with a fixed number pattern like 999
  466. if opts.maskNum:
  467. # money
  468. vPattern = re.compile(r"(?P<curr>[£€$]|GBP|AUD|NZD|SGD)[ \t]*\d+(?:,\d+)*(?:\.\d+)?")
  469. vInput, vNum1 = vPattern.subn(r"\g<curr>999", vInput)
  470. vTotalNums += vNum1
  471. # any number sequence
  472. ## Note: do this after all number-related masking
  473. vPattern = re.compile(r"\d+(?:,\d+)*(?:\.\d+)?")
  474. vInput, vNum2 = vPattern.subn(r"999", vInput)
  475. vTotalNums += vNum2
  476. if vNum1 + vNum2 > 0:
  477. vNumLines += 1
  478. vNum1 = 0
  479. vNum2 = 0
  480. # merging adjacent non-identical masks
  481. # Note: do before masing special characters
  482. #vPattern = re.compile("(?:-[A-Z]{3}-)[ \t]*(?:\1[ \t]*){1,}")
  483. #vInput, vNum1 = vPattern.subn(r" \1 ", vInput)
  484. #vTotalAdjacentMasks += vNum1
  485. #if vNum1 > 0:
  486. # vAMaskLines += 1
  487. # vNum1 = 0
  488. # masking all special characters
  489. ## Note: this should be done after all pattern maskings, unless
  490. ## one is needed before a specific pattern is masked.
  491. ## Note: to be able to count maskings, we don't use replace()
  492. # masking angle brackets
  493. if opts.maskABracket:
  494. vPattern = re.compile("<")
  495. vInput, vNum1 = vPattern.subn(r" -LAB- ", vInput)
  496. vTotalABrackets += vNum1
  497. vPattern = re.compile(">")
  498. vInput, vNum2 = vPattern.subn(r" -RAB- ", vInput)
  499. vTotalABrackets += vNum2
  500. if vNum1 + vNum2 > 0:
  501. vABracketLines += 1
  502. vNum1 = 0
  503. vNum2 = 0
  504. # masking round brackets
  505. if opts.maskRBracket:
  506. vPattern = re.compile("\(")
  507. vInput, vNum1 = vPattern.subn(r" -LRB- ", vInput)
  508. vTotalRBrackets += vNum1
  509. vPattern = re.compile("\)")
  510. vInput, vNum2 = vPattern.subn(r" -RRB- ", vInput)
  511. vTotalRBrackets += vNum2
  512. if vNum1 + vNum2 > 0:
  513. vRBracketLines += 1
  514. vNum1 = 0
  515. vNum2 = 0
  516. # masking square brackets
  517. if opts.maskSBracket:
  518. vPattern = re.compile("\[")
  519. vInput, vNum1 = vPattern.subn(r" -LAB- ", vInput)
  520. vTotalSBrackets += vNum1
  521. vPattern = re.compile("\]")
  522. vInput, vNum2 = vPattern.subn(r" -RSB- ", vInput)
  523. vTotalSBrackets += vNum2
  524. if vNum1 + vNum2 > 0:
  525. vSBracketLines += 1
  526. vNum1 = 0
  527. vNum2 = 0
  528. # masking lines useless for translation
  529. if opts.emptyUseless:
  530. vUseful = False
  531. # all lines with only tokens starting with a mask string
  532. vlTokens = vInput.split()
  533. for vToken in vlTokens:
  534. if re.match("(:?[^\s\w\d]*-[A-Z]{3}-[^\s\w\d]*)|(?:[^\s\w\d]*999[^\s\w\d]*)", vToken) == None:
  535. vUseful = True
  536. break
  537. ## all lines not containing a full word with at least 2
  538. ## character length
  539. if vUseful:
  540. vPattern = re.compile(r"\b[A-Za-z]{2,}\b")
  541. if len(re.findall(vPattern, vInput)) == 0:
  542. vUseful = False
  543. if not vUseful:
  544. vInput = "\n"
  545. vTotalUseless += 1
  546. # writing final output
  547. print vInput,
  548. vLineCntr += 1
  549. sys.stderr.write(str(vLineCntr) + " lines processed\n")
  550. ################# New
  551. sys.stderr.write(str(vTotalLBegins) + " line beginnings in " + str(vLBeginLines) + " lines cleaned\n")
  552. sys.stderr.write(str(vTotalSStrs) + " special strings in " + str(vSStrLines) + " lines masked\n")
  553. ################# New
  554. sys.stderr.write(str(vTotalMemAddrs) + " memory addresses in " + str(vMemAddrLines) + " lines masked\n")
  555. sys.stderr.write(str(vTotalRegKeys) + " windows registry keys in " + str(vRegKeyLines) + " lines masked\n")
  556. sys.stderr.write(str(vTotalPHolders) + " placeholders in " + str(vPHolderLines) + " lines masked\n")
  557. sys.stderr.write(str(vTotalLNums) + " list numbers in " + str(vLNumLines) + " lines masked\n")
  558. sys.stderr.write(str(vTotalMItems) + " menu paths in " + str(vMItemLines) + " lines masked\n")
  559. sys.stderr.write(str(vTotalTags) + " HTML/XML tags in " + str(vTagLines) + " lines masked\n")
  560. sys.stderr.write(str(vTotalVars) + " variables in " + str(vVarLines) + " lines masked\n")
  561. sys.stderr.write(str(vTotalBSlashs) + " newline/tab/.. in " + str(vBSlashLines) + " lines masked\n")
  562. sys.stderr.write(str(vTotalEmails) + " Emails in " + str(vEmailLines) + " lines masked\n")
  563. sys.stderr.write(str(vTotalURLs) + " URLs in " + str(vURLLines) + " lines masked\n")
  564. sys.stderr.write(str(vTotalWPaths) + " Windows file pathes in " + str(vWPathLines) + " lines masked\n")
  565. sys.stderr.write(str(vTotalUPaths) + " Unix file pathes in " + str(vUPathLines) + " lines masked\n")
  566. sys.stderr.write(str(vTotalIPVs) + " IP addresses and version numbers in " + str(vIPVLines) + " lines masked\n")
  567. sys.stderr.write(str(vTotalDTs) + " date/times in " + str(vDTLines) + " lines masked\n")
  568. ################# New
  569. sys.stderr.write(str(vTotalNums) + " numbers in " + str(vNumLines) + " lines masked\n")
  570. sys.stderr.write(str(vTotalABrackets) + " angle brackets in " + str(vABracketLines) + " lines masked\n")
  571. sys.stderr.write(str(vTotalRBrackets) + " round brackets in " + str(vRBracketLines) + " lines masked\n")
  572. sys.stderr.write(str(vTotalSBrackets) + " square brackets in " + str(vSBracketLines) + " lines masked\n")
  573. sys.stderr.write(str(vTotalAdjacentMasks) + " adjacent masks in " + str(vAMaskLines) + " lines masked\n")
  574. sys.stderr.write(str(vTotalUseless) + " useless lines emptied\n")
  575. ##======================================================================
  576. ## calling main
  577. if __name__ == "__main__":
  578. sys.exit(main())