You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
2.0 KiB
62 lines
2.0 KiB
import re,string
|
|
|
|
|
|
class reason_counter(object):
    """Occurrence counter for one reason, together with its display wording.

    A fresh counter starts at one occurrence and remembers the wording it
    was created with.  Each call to ``update`` adds one occurrence and
    replaces the stored wording with the one supplied by the caller.
    """

    def __init__(self, wording):
        """Start counting at 1 with ``wording`` as the text to display."""
        self.wording = wording
        self.num = 1

    def update(self, new_wording):
        """Record one more occurrence and switch to ``new_wording``."""
        self.num += 1
        self.wording = new_wording

    def html(self):
        """Return the display text for this reason.

        A reason seen exactly once is returned as its bare wording; any
        other count is rendered as ``"<wording> (<num>+)"``.
        """
        if self.num == 1:
            return self.wording
        return "%s (%d+)" % (self.wording, self.num)

    def __repr__(self):
        return "%s(wording=%r, num=%d)" % (
            type(self).__name__, self.wording, self.num)


def numbers_are_irrelevant(txt):
    """Tell whether the numbers inside reason ``txt`` should become ``NN``.

    Currently the answer is always ``True`` and ``txt`` is not inspected.
    The argument exists as an extension point: if/when some categories of
    reasons need to keep their numbers, this function shall return
    ``False`` for those categories.
    """
    return True


def aggregate_reason_fields(reasons_list):
    """Collapse a list of reason strings into counted display strings.

    Each entry of ``reasons_list`` may itself hold several reasons
    separated by ``|``.  Every individual reason is normalised (outer
    whitespace stripped, inner whitespace runs collapsed to one space,
    and, when ``numbers_are_irrelevant`` agrees, digit runs replaced by
    ``NN``); reasons with the same normalised form are counted together.

    Returns a list of strings produced by ``reason_counter.html()``,
    ordered from the most frequent reason to the least frequent.  Empty
    reasons are ignored, so an empty input yields an empty list.
    """
    whitespace_run = re.compile(r"\s+")
    digit_run = re.compile(r"\d+")

    # Flatten: join everything with the separator, then split once.
    reasons = '|'.join(reasons_list).split('|')

    reason_htable = {}
    for reason in reasons:
        reason_reduced = whitespace_run.sub(" ", reason.strip())
        if reason_reduced == '':
            continue  # ignore empty reasons

        if numbers_are_irrelevant(reason_reduced):
            # Generalise the reason by replacing its numbers with NN.
            reason_reduced = digit_run.sub("NN", reason_reduced)

        counter = reason_htable.get(reason_reduced)
        if counter is None:
            # A reason that occurred only once keeps its original
            # (non-reduced) wording.
            reason_htable[reason_reduced] = reason_counter(reason)
        else:
            # A reason that occurred more than once is shown in its
            # reduced/generalised form.
            counter.update(reason_reduced)

    # sorted() accepts any iterable, unlike dict.keys().sort(), which only
    # works where keys() returns a list (Python 2).
    ranked = sorted(reason_htable.values(),
                    key=lambda counter: counter.num,
                    reverse=True)
    # Build a real list so callers can index/len() the result on Python 3,
    # where map() would hand back a lazy iterator.
    return [counter.html() for counter in ranked]