123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625 |
#! /usr/bin/python
# coding=UTF-8
## This script masks specific patterns in the input string.
##
## Current version: 1.0
##
import optparse
import re
import sys
##======================================================================
## masking rules

# A token counts as "mask only" when it starts with a -XXX- mask or with the
# fixed number mask 999, optionally wrapped in punctuation.
_MASK_TOKEN_RE = re.compile(
    r"(?:[^\s\w\d]*-[A-Z]{3}-[^\s\w\d]*)|(?:[^\s\w\d]*999[^\s\w\d]*)")
# A "full word": at least two consecutive ASCII letters.
_WORD_RE = re.compile(r"\b[A-Za-z]{2,}\b")


def _rule(pattern, repl, flags=0):
    """Return a (compiled pattern, replacement) pair."""
    return (re.compile(pattern, flags), repl)


def _collapse(tag):
    """Return the rule that merges runs of adjacent identical -tag- masks."""
    return _rule(r"-%s-(?:[ \t]*-%s-)+" % (tag, tag), "-%s-" % tag)


class _MaskStep(object):
    """One optional masking pass together with its replacement statistics.

    dest    -- optparse destination name that switches the pass on
    noun    -- what gets replaced; used in the summary written to stderr
    counted -- (pattern, replacement) pairs whose substitutions are counted
    before  -- pairs applied first; their substitutions are not counted
    after   -- pairs applied last; their substitutions are not counted
    verb    -- last word of the summary line ("masked" or "cleaned")
    """

    def __init__(self, dest, noun, counted, before=(), after=(),
                 verb="masked"):
        self.dest = dest
        self.noun = noun
        self.counted = counted
        self.before = before
        self.after = after
        self.verb = verb
        self.total = 0   # substitutions made over all lines
        self.lines = 0   # lines with at least one counted substitution

    def apply(self, text):
        """Run the pass on one line, update the counters, return the result."""
        for pattern, repl in self.before:
            text = pattern.sub(repl, text)
        hits = 0
        for pattern, repl in self.counted:
            text, found = pattern.subn(repl, text)
            hits += found
        self.total += hits
        if hits > 0:
            self.lines += 1
        for pattern, repl in self.after:
            text = pattern.sub(repl, text)
        return text

    def report(self):
        """Return the one-line summary for this pass."""
        return "%d %s in %d lines %s\n" % (
            self.total, self.noun, self.lines, self.verb)


def _build_mask_steps():
    """Return fresh mask steps in the order they must be applied.

    The order matters: several patterns rely on earlier passes having
    already replaced overlapping constructs (see the notes on each step).

    Note: where a mask is surrounded by spaces, the intention is to split
    fusions, e.g. follow:http://. -> follow -URL- .  Since this adds an
    unwanted space for non-fused cases, tokenizing or cleaning of trailing
    spaces afterwards is advised.
    """
    ic = re.IGNORECASE
    return [
        # cleaning line beginnings (bullets and similar markers)
        # NOTE(review): some alternatives look mis-encoded (control
        # characters / U+FFFD); confirm against the intended bullet glyphs.
        _MaskStep("maskLBegin", "line beginnings", [
            _rule(r"^[ \t]*(?:||-|•|·|�|§|©|\*)[ \t]*(?=[A-Za-z\d])", r""),
        ], verb="cleaned"),

        # special strings seen in data,
        # e.g. 60:06:01:60:83:B0:11:00:4A:C4:8A:1D:
        _MaskStep("maskSpecialStr", "special strings", [
            _rule(r"(?:(?<=[\s#])|^)(?:[\d\w]{2,4}:){4,}[\d\w]*", r" -SPS-"),
        ], after=[_collapse("SPS")]),

        # memory addresses, e.g. 0x88888888; <MEM_ADDR> was found in
        # already filtered test data
        _MaskStep("maskMemAddr", "memory addresses", [
            _rule(r"0x[\d\w]+", r"-MEM-", ic),
            _rule(r"<MEM_ADDR>", r"-MEM-", ic),
        ], after=[_collapse("MEM")]),

        # windows registry keys
        _MaskStep("maskRegKey", "windows registry keys", [
            _rule(r"(?:HKEY_|HKLM).*?((?:[\s#\)\"',\]])|\.[\s])",
                  r" -WRK- \1"),
        ], after=[_collapse("WRK")]),

        # placeholders: \{...\}
        # Note: mask after windows registry keys, before windows file path
        _MaskStep("maskPHolder", "placeholders", [
            _rule(r"\\\{.*?\\\}", r" -PHL- "),
        ], after=[_collapse("PHL")]),

        # list numbers
        # Note: mask before angle-bracket-related patterns
        _MaskStep("maskListNum", "list numbers", [
            # N., N-, N)
            _rule(r"^\s*\d+\s*[\).-](?=\s+[^\d]|$)", r"-LNM- "),
            # <N>
            _rule(r"^\s*<\s*\d+\s*>", r"-LNM- "),
            # A., A-, A)
            _rule(r"^\s*\w\s*[\).-](?=\s+[^\d])", r"-LNM- "),
        ], after=[_collapse("LNM")]),

        # menu items
        # Note: mask before variable
        # For example, in Click < >Reports < >, the menu item is replaced
        # with "Reports". This is editing rather than masking.
        _MaskStep("maskMenuItem", "menu paths", [
            _rule(r"<\s*>\s*([^<>]*?)\s*<\s*>", r'"\1"'),
        ]),

        # HTML/XML tags (the whole tag is masked)
        # Note: mask after menu items above and before variable
        _MaskStep("maskTag", "HTML/XML tags", [
            # HTML-like
            _rule(r"<\w+\s*\w+\=.*?[>/].*>", r" -TAG- "),
            # XML-like
            _rule(r"<\w+\:.*?[>/].*>", r" -TAG- "),
        ], after=[_collapse("TAG")]),

        # variables
        # Note: mask after menu items above
        _MaskStep("maskVar", "variables", [
            # e.g. <token >AssignOrgGroupTask < >
            _rule(r"<[\w-]+\s*>\s*[^<>]*?\s*<\s*>", r" -VAR- "),
            # fusions like [..]X...
            _rule(r"(?:\s|^)\[[^\]]*?\]([A-Z])", r"-VAR- \1"),
            # [..], <..>
            _rule(r"[<\[].*?[>\]]", r"-VAR-"),
        ], after=[_collapse("VAR")]),

        # literal \n \r \t sequences are replaced with whitespace
        # Note: mask before windows file path
        _MaskStep("maskBSlash", "newline/tab/..", [
            _rule(r"(?:\s+(?:\\+[ntr])+)+", r" "),
            _rule(r"(?:([?.,\[\}]+)(?:\\+[ntr])+)+", r"\1 "),
            _rule(r"(?:\\+[ntr])+$", r""),
            # this should be done last
            _rule(r"(?:\\+[ntr]){2,}", r" "),
        ]),

        # Email (this pattern masks more than emails)
        # Note: Email should be masked before URL
        _MaskStep("maskEmail", "Emails", [
            _rule(r"\b[^\s@]+@[\w\d].*?([\s,\)]|\.\s)", r" -EML-\1", ic),
        ], after=[_collapse("EML")]),

        # URL; <url_path> was found in already filtered test data
        _MaskStep("maskURL", "URLs", [
            _rule(r"(t?ftp:|https?:).*?([\s)\"',\]\}]|\.[\s])",
                  r" -URL- \2", ic),
            _rule(r"[w]{2,}.*?\..*?([\s)\"',\]\}]|\.[\s])",
                  r" -URL- \1", ic),
            _rule(r"\b[\S]{2,}[.](?:com|net|org|co\.uk|ru|ch|de|fr)\b[\S]*?([\s)\"',\]\}]|\.[\s])",
                  r" -URL- \1", ic),
            _rule(r"<(?:url_path|url_path|url_path)>", r"-URL-", ic),
        ], after=[_collapse("URL")]),

        # windows file paths
        _MaskStep("maskWPath", "Windows file pathes", [
            # quotes are used to handle paths containing whitespace
            _rule(r"[\"'][^\s]*?\\+.*?[\"']", r" -WFP- "),
            _rule(r"(?:\b|\\)[\\]*(?:[^\s/]+?\\+)+[^\s/]*\b", r" -WFP- "),
            # Meant to match file names. It may also match domain names
            # missed by URL masking as well as some fused periods, and it
            # does not cover every possible file name.
            _rule(r"(^|[^\w\d.-])[\w\d-]{2,}(?:\.[\w\d-]{2,})+(['\"\)\s.])",
                  r"\1-WFP-\2"),
            # found in already filtered test data
            _rule(r"<winpath>", r"-WFP-", ic),
        ], before=[
            # Path components containing whitespace are shortened first so
            # the counted patterns can consume the whole path.
            _rule(r"\\[A-Za-z0-9_-]+(?:[\s]+[A-Za-z0-9_-]+){1,3}(?=\\)",
                  r"\\WFP"),
            # a known whitespace case not captured above
            _rule(r"\\enterprise vault", r"\\WFP", ic),
        ], after=[
            # merging masks with trailing backslashes
            _rule(r"-WFP-\s*\\+(?=[\s#)\"',\]\}]|\.[\s])", r" -WFP- "),
            _collapse("WFP"),
        ]),

        # Unix-like file paths
        _MaskStep("maskUPath", "Unix file pathes", [
            # quotes are used to handle paths containing whitespace
            _rule(r"[\"'][\S]*?/+.*?[\"']", r" -UFP- "),
            # requires more than one / to avoid masking non-path patterns
            # like Usersyyyyyy/Symantec
            _rule(r"(?:\b|/)[/]*(?:[^\s/]+?/+){2,}[^\s/]*\b", r" -UFP- "),
        ], before=[
            # Path components containing whitespace are shortened first so
            # the counted patterns can consume the whole path.
            _rule(r"/[A-Za-z0-9_-]{2,}(?:[\s]+[A-Za-z0-9_-]{2,}){1,3}(?=/)",
                  r"/UFP"),
        ], after=[_collapse("UFP")]),

        # IP addresses and version numbers; they are hard to tell apart,
        # so they share one mask
        _MaskStep("maskIPV", "IP addresses and version numbers", [
            _rule(r"\b[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}(?::[\d]{1,5})?(?:/[\d]{1,2})?(?=[\D])",
                  r"-IPV-"),
            _rule(r"\b[vV]?[\d]{1,2}\.[\d]{1,2}(?:\.[\d]{1,5})+(?=[\D])",
                  r"-IPV-"),
            # found in already filtered test data
            _rule(r"<(?:ip_address|adresse_ip|ip_ver)>", r"-IPV-", ic),
        ], after=[_collapse("IPV")]),

        # date/times; only clock times and the <date_and_time> marker are
        # matched, neighbouring masks are merged afterwards
        _MaskStep("maskDateTime", "date/times", [
            _rule(r"(?:(?<=[^\d:])|^)[\d]{1,2}:[\d]{2}(?::[\d]{2})?(?:\.[\d]+)?\s*(?:AM|A\.M\.|PM|P\.M\.)?\s*(?:GMT|UTC|CET|EDT|EST|EET|PDT|PST)?(?=[^\d])",
                  r"-DTM- ", ic),
            _rule(r"<date_and_time>", r"-DTM-", ic),
        ], after=[_collapse("DTM")]),

        # numbers are replaced with the fixed number 999
        # Note: do this after all number-related masking
        _MaskStep("maskNum", "numbers", [
            # money
            _rule(r"(?P<curr>[£€$]|GBP|AUD|NZD|SGD)[ \t]*\d+(?:,\d+)*(?:\.\d+)?",
                  r"\g<curr>999"),
            # any number sequence
            _rule(r"\d+(?:,\d+)*(?:\.\d+)?", r"999"),
        ]),

        # Special characters are masked after all pattern maskings.
        # Regexes are used instead of str.replace() so hits can be counted.
        _MaskStep("maskABracket", "angle brackets", [
            _rule(r"<", r" -LAB- "),
            _rule(r">", r" -RAB- "),
        ]),
        _MaskStep("maskRBracket", "round brackets", [
            _rule(r"\(", r" -LRB- "),
            _rule(r"\)", r" -RRB- "),
        ]),
        _MaskStep("maskSBracket", "square brackets", [
            _rule(r"\[", r" -LSB- "),
            _rule(r"\]", r" -RSB- "),
        ]),
    ]


def _is_useless(text):
    """Tell whether a processed line carries nothing worth translating.

    A line is useless when every whitespace-separated token starts with a
    mask (this includes a line without tokens), or when it contains no word
    of at least two ASCII letters.
    """
    if all(_MASK_TOKEN_RE.match(token) for token in text.split()):
        return True
    return _WORD_RE.search(text) is None


##======================================================================
## main
def main(argv=None):
    """Mask the selected patterns in the lines read from stdin.

    argv -- full argument vector including the program name; defaults to
            sys.argv when None.

    Each processed line is written to stdout; replacement statistics are
    written to stderr once the input is exhausted. Returns None.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog [OPTIONS]" +
        "\nMasks specific patterns in the input sentence.", version="%prog 1.0")
    parser.add_option("-a", "--all", help="mask every implemented pattern", dest="maskAll", action="store_true")
    parser.add_option("-g", "--lbegin", help="clean beginning of line", dest="maskLBegin", action="store_true")
    parser.add_option("-s", "--specialstr", help="mask special strings (e.g. C4:8A:1D:)", dest="maskSpecialStr", action="store_true")
    parser.add_option("-o", "--memaddr", help="mask memory addresses", dest="maskMemAddr", action="store_true")
    parser.add_option("-r", "--regkey", help="mask windows registry key", dest="maskRegKey", action="store_true")
    parser.add_option("-p", "--pholder", help="mask placeholders", dest="maskPHolder", action="store_true")
    parser.add_option("-l", "--listnum", help="mask list numbers", dest="maskListNum", action="store_true")
    parser.add_option("-m", "--menuitem", help="mask menu item", dest="maskMenuItem", action="store_true")
    parser.add_option("-t", "--tag", help="mask HTML/XML tags", dest="maskTag", action="store_true")
    parser.add_option("-v", "--var", help="mask variables", dest="maskVar", action="store_true")
    # The backslashes are escaped so the help shows \n \r \t literally.
    parser.add_option("-b", "--bslash", help="replace \\n \\r \\t with white space", dest="maskBSlash", action="store_true")
    parser.add_option("-e", "--email", help="mask Email", dest="maskEmail", action="store_true")
    parser.add_option("-u", "--url", help="mask URL", dest="maskURL", action="store_true")
    parser.add_option("-w", "--winpath", help="mask Windows file path", dest="maskWPath", action="store_true")
    parser.add_option("-x", "--unixpath", help="mask Unix file path", dest="maskUPath", action="store_true")
    parser.add_option("-i", "--ipverno", help="mask IP address and version number", dest="maskIPV", action="store_true")
    parser.add_option("-d", "--datetime", help="mask date/time", dest="maskDateTime", action="store_true")
    parser.add_option("-n", "--number", help="mask numbers", dest="maskNum", action="store_true")
    parser.add_option("--ab", help="mask angle brackets", dest="maskABracket", action="store_true")
    parser.add_option("--rb", help="mask round brackets", dest="maskRBracket", action="store_true")
    parser.add_option("--sb", help="mask square brackets", dest="maskSBracket", action="store_true")
    parser.add_option("--useless", help="mask useless lines", dest="emptyUseless", action="store_true")
    ### add pattern options here (and a matching step in _build_mask_steps)

    # argv[0] is the program name, which optparse must not see.
    opts, _positional = parser.parse_args(argv[1:])

    steps = _build_mask_steps()
    # --all switches on every masking step; --useless stays opt-in.
    if opts.maskAll:
        for step in steps:
            setattr(opts, step.dest, True)
    enabled = [step for step in steps if getattr(opts, step.dest)]

    line_count = 0
    useless_count = 0
    while True:
        line = sys.stdin.readline()
        # NOTE(review): a blank or whitespace-only line ends processing, not
        # only end of input. Kept as in the original; confirm it is intended.
        if not line.strip():
            break
        for step in enabled:
            line = step.apply(line)
        # emptying lines useless for translation
        if opts.emptyUseless and _is_useless(line):
            line = "\n"
            useless_count += 1
        # writing final output
        sys.stdout.write(line)
        line_count += 1

    sys.stderr.write(str(line_count) + " lines processed\n")
    for step in steps:
        sys.stderr.write(step.report())
    # Merging adjacent non-identical masks is not implemented; the summary
    # line is kept so the report format stays unchanged.
    sys.stderr.write("0 adjacent masks in 0 lines masked\n")
    sys.stderr.write(str(useless_count) + " useless lines emptied\n")
##======================================================================
## script entry point: run main() and hand its result to the interpreter
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|