json_to_doc.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. #!/usr/bin/env python3
  2. import argparse
  3. import json
  4. import os
  5. import sys
  6. TEMPLATE = """<DOC>
  7. <DOCNO>{}</DOCNO>
  8. {}
  9. </DOC>"""
  10. def parse_arguments():
  11. parser = argparse.ArgumentParser()
  12. parser.add_argument('file_list')
  13. parser.add_argument('out_dir')
  14. return parser.parse_args()
  15. class Segmenter:
  16. def __init__(self, ranges, items):
  17. self.ranges = ranges
  18. self.segment_dict = self.create_segment_dict(ranges)
  19. self.active = []
  20. self.items = items
  21. def create_segment_dict(self, ranges):
  22. d = {}
  23. for s_time, e_time in ranges:
  24. d['{}-{}'.format(s_time, e_time)] = []
  25. return d
  26. def get_segment_dict(self):
  27. return self.segment_dict
  28. def run(self):
  29. self.active.append(self.ranges.pop())
  30. for item in self.items:
  31. start_time = float(item['startTime'].strip('s'))
  32. end_time = float(item['endTime'].strip('s'))
  33. if self.ranges:
  34. self.update_active(start_time, end_time)
  35. if self.active:
  36. self.update_complete(start_time, end_time)
  37. for active_range in self.active:
  38. key = '{}-{}'.format(*active_range)
  39. self.segment_dict[key].append(item['word'])
  40. def update_active(self, start_time, end_time):
  41. next_range = self.ranges[-1]
  42. if start_time >= next_range[0] and end_time < next_range[1]:
  43. self.active.append(self.ranges.pop())
  44. elif start_time <= next_range[0] and end_time > next_range[0]:
  45. self.active.append(self.ranges.pop())
  46. def update_complete(self, start_time, end_time):
  47. update = []
  48. for r in self.active:
  49. if end_time > r[1]:
  50. continue
  51. else:
  52. update.append(r)
  53. self.active = update
  54. def extract_words(f_path):
  55. # there should be two transcripts for each json file
  56. # the 2nd transcripts contain speaker tag
  57. data = []
  58. with open(f_path) as ifile:
  59. json_out = json.load(ifile)
  60. transcripts = []
  61. cur_time = 0
  62. for res in json_out['results']:
  63. hyp = res['alternatives'][0]
  64. if not hyp:
  65. continue
  66. items = hyp['words']
  67. for item in items:
  68. end_time = float(item['endTime'].strip('s'))
  69. if end_time < cur_time:
  70. # when 1st transcripts is done, load 2nd ones
  71. data.append(transcripts)
  72. transcripts = []
  73. cur_time = end_time
  74. transcripts.append(item)
  75. if transcripts:
  76. data.append(transcripts)
  77. return data
  78. def get_ranges(last_item):
  79. end_time = int(float(last_item['endTime'].strip('s')))
  80. # offsets in output need to be multiples of 60
  81. ranges = list(range(0, end_time, 60))
  82. segment_ranges = [(t, t+120) for t in ranges]
  83. segment_ranges.reverse()
  84. return segment_ranges
  85. def extract_content(f_path):
  86. data = extract_words(f_path)
  87. if len(data) >= 3:
  88. print('{}: more than 3 transcription variants'.format(f_path),
  89. file=sys.stderr)
  90. # ignore the 2nd transcripts variant which contains SpeakerTag
  91. transcripts = data[0]
  92. segment_ranges = get_ranges(transcripts[-1])
  93. segmenter = Segmenter(segment_ranges, transcripts)
  94. segmenter.run()
  95. return segmenter.get_segment_dict()
  96. def write_output(show_id, segment_dict, out_dir):
  97. with open(os.path.join(out_dir, show_id+'.xml'), 'w') as ofile:
  98. for k in sorted(list(segment_dict.keys())):
  99. # Add show_id and time stamps for identifier
  100. doc_no = '{}_{}'.format(show_id, k)
  101. to_write = TEMPLATE.format(
  102. doc_no, ' '.join(segment_dict[k]))
  103. print(to_write, file=ofile)
  104. def main():
  105. args = parse_arguments()
  106. if not os.path.exists(args.out_dir):
  107. os.mkdir(args.out_dir)
  108. with open(args.file_list) as ifile:
  109. for line in ifile:
  110. f_path = line.rstrip('\n')
  111. f_name = os.path.basename(f_path).split('.')[0]
  112. try:
  113. segment_dict = extract_content(f_path)
  114. except Exception as e:
  115. print('{}: error while extracting '
  116. 'transcripts from json file'.format(f_path),
  117. file=sys.stderr)
  118. break
  119. write_output(f_name, segment_dict, args.out_dir)
  120. if __name__=="__main__":
  121. main()