# xmlgat_to_EVT_temp.py
  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import csv
  6. import json
  7. # %%
  8. # Import lems list + info file + authority files
  9. basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'
  10. # lems
  11. lems = json.load(open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r'))
  12. # datini people EAC
  13. with open(basedir + 'DallASPO/data_eac_datini.csv') as infile:
  14. reader = csv.DictReader(infile)
  15. data_eac = [row for row in reader]
  16. # datini OVI-ASPO data
  17. with open(basedir + 'FULL_MERGED.csv') as infile:
  18. reader = csv.DictReader(infile)
  19. datini_oviaspo = [row for row in reader]
  20. # %%
  21. def lemIndex(lem):
  22. for item in lems:
  23. if lem.attrib['n'] in item['coordinate']:
  24. return item['id']
  25. else:
  26. raise ValueError()
  27. # %%
  28. # Import individual letter files
  29. # Example file
  30. filecodeexample = '99b'
  31. tree1 = ET.parse(basedir + 'DallOVI/datiniXML/xmlgat/' + 'xmlgat.' + filecodeexample + '.xml')
  32. root1 = tree1.getroot()
  33. # %%
  34. # Lems in the xmlgat files have no children;
  35. # Single-word lems are in the tail of the corr. lem tags;
  36. # Multiple-word lems are in <w> tags immediately following the <lem>
  37. # The body of the text is inside a single <div>
  38. # TRY TO PROCESS THE EXAMPLE FILE
  39. textbody = list(root1.iter('div'))[0]
  40. texttags = []
  41. for node in textbody:
  42. if(node.tag == 'lem' or node.tag == 'w'):
  43. texttags.append(node)
  44. print(len(texttags))
  45. ET.dump(textbody)
  46. # %%
  47. worklist = []
  48. doit = False
  49. for node in texttags:
  50. if doit and node.tag=='w':
  51. worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
  52. node.tag = 'NEWlem'
  53. node.attrib = prev_node.attrib
  54. prev_node.tag = 'OLDlem'
  55. if node.tag == 'lem' and node.tail != None:
  56. thelem = re.findall(r'\w+', node.tail)[0] # First word
  57. worklist.append(('single-word', node.attrib['n'], thelem))
  58. node.text = thelem
  59. node.tail = node.tail.replace(thelem, '')
  60. doit = False
  61. else:
  62. doit = True
  63. prev_node = node
  64. for node in textbody.findall('OLDlem'):
  65. textbody.remove(node)
  66. for node in textbody.findall('NEWlem'):
  67. node.tag = 'lem'
  68. for node in textbody.findall('lem'):
  69. node.attrib['n'] = filecodeexample + '_' + node.attrib['n']
  70. ET.dump(textbody)
  71. # %%
  72. for node in textbody.findall('lem'):
  73. node.attrib['ref'] = '#' + str(lemIndex(node))
  74. node.attrib.pop('n')
  75. ET.dump(textbody)
  76. # %%
  77. for node in textbody.findall('lem'):
  78. ind = int(node.attrib['ref'][1:])
  79. if lems[ind]['lemma']['categoria']=='antr.':
  80. sb = ET.SubElement(node, 'persName')
  81. sb.text = node.text
  82. sb.attrib['ref'] = node.attrib['ref']
  83. node.text = ''
  84. else:
  85. if lems[ind]['lemma']['categoria']=='n.g.':
  86. sb = ET.SubElement(node, 'placeName')
  87. sb.text = node.text
  88. sb.attrib['ref'] = node.attrib['ref']
  89. node.text = ''
  90. ET.dump(textbody)
  91. tree1.write(basedir + 'prova.xml')
  92. # %%
  93. oviPlaces = [{'id': item['id'], 'lemma': item['lemma']} for item in list(filter( lambda el: el['lemma']['categoria']=='n.g.', lems ))]
  94. # %%
  95. with open(basedir + "ovi_places.json", "w") as outfile:
  96. json.dump(oviPlaces, outfile, indent=2)
  97. # %%
  98. oviNames = [{'id': item['id'], 'lemma': item['lemma']} for item in list(filter( lambda el: el['lemma']['categoria']=='antr.', lems ))]
  99. # %%
  100. with open(basedir + "ovi_names.json", "w") as outfile:
  101. json.dump(oviNames, outfile, indent=2)
  102. # %%
  103. print(len(oviPlaces))
  104. print(len(oviNames))
  105. # %%