# TEMPILO.py (3.0 KB)
  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import json
  6. # %%
  7. # Import lems list
  8. basedir = '/home/kora/Desktop/OVI_Data_Local/OVIaspo/DallOVI/datiniXML/'
  9. lems = json.load(open(basedir + 'power_lemmarioB.json', 'r'))
  10. # %%
  11. def lemIndex(lem):
  12. for item in lems:
  13. if lem.attrib['n'] in item['coordinate']:
  14. return item['id']
  15. else:
  16. raise ValueError()
  17. # %%
  18. # Import files
  19. # Example file
  20. filecodeexample = '99b'
  21. tree1 = ET.parse(basedir + 'xmlgat/' + 'xmlgat.' + filecodeexample + '.xml')
  22. root1 = tree1.getroot()
  23. # %%
  24. # Lems in the xmlgat files have no children;
  25. # Single-word lems are in the tail of the corr. lem tags;
  26. # Multiple-word lems are in <w> tags immediately following the <lem>
  27. # The body of the text is inside a single <div>
  28. # TRY TO PROCESS THE EXAMPLE FILE
  29. textbody = list(root1.iter('div'))[0]
  30. texttags = []
  31. for node in textbody:
  32. if(node.tag == 'lem' or node.tag == 'w'):
  33. texttags.append(node)
  34. print(len(texttags))
  35. ET.dump(textbody)
  36. # %%
  37. worklist = []
  38. doit = False
  39. for node in texttags:
  40. if doit and node.tag=='w':
  41. worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
  42. node.tag = 'NEWlem'
  43. node.attrib = prev_node.attrib
  44. prev_node.tag = 'OLDlem'
  45. if node.tag == 'lem' and node.tail != None:
  46. thelem = re.findall(r'\w+', node.tail)[0] # First word
  47. worklist.append(('single-word', node.attrib['n'], thelem))
  48. node.text = thelem
  49. node.tail = node.tail.replace(thelem, '')
  50. doit = False
  51. else:
  52. doit = True
  53. prev_node = node
  54. for node in textbody.findall('OLDlem'):
  55. textbody.remove(node)
  56. for node in textbody.findall('NEWlem'):
  57. node.tag = 'lem'
  58. for node in textbody.findall('lem'):
  59. node.attrib['n'] = filecodeexample + '_' + node.attrib['n']
  60. ET.dump(textbody)
  61. # %%
  62. for node in textbody.findall('lem'):
  63. node.attrib['ref'] = '#' + str(lemIndex(node))
  64. node.attrib.pop('n')
  65. ET.dump(textbody)
  66. # %%
  67. for node in textbody.findall('lem'):
  68. ind = int(node.attrib['ref'][1:])
  69. if lems[ind]['lemma']['categoria']=='antr.':
  70. sb = ET.SubElement(node, 'persName')
  71. sb.text = node.text
  72. sb.attrib['ref'] = node.attrib['ref']
  73. node.text = ''
  74. else:
  75. if lems[ind]['lemma']['categoria']=='n.g.':
  76. sb = ET.SubElement(node, 'placeName')
  77. sb.text = node.text
  78. sb.attrib['ref'] = node.attrib['ref']
  79. node.text = ''
  80. ET.dump(textbody)
  81. tree1.write(basedir + 'prova.xml')
  82. # %%
  83. oviPlaces = [{'id': item['id'], 'lemma': item['lemma']} for item in list(filter( lambda el: el['lemma']['categoria']=='n.g.', lems ))]
  84. # %%
  85. with open(basedir + "ovi_places.json", "w") as outfile:
  86. json.dump(oviPlaces, outfile, indent=2)
  87. # %%
  88. oviNames = [{'id': item['id'], 'lemma': item['lemma']} for item in list(filter( lambda el: el['lemma']['categoria']=='antr.', lems ))]
  89. # %%
  90. with open(basedir + "ovi_names.json", "w") as outfile:
  91. json.dump(oviNames, outfile, indent=2)
  92. # %%
  93. print(len(oviPlaces))
  94. print(len(oviNames))
  95. # %%