# xmlgat_to_EVT.py (4.3 KB)

  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import json
  6. import os
  7. # %%
  8. # Import lems list + xml info file
  9. basedir = '../../Data/'
  10. # lems
  11. lemfile = basedir + 'DallOVI/datiniXML/power_lemmarioB.json'
  12. lems = json.load(open(lemfile, 'r'))
  13. # BiblioDatini.xml
  14. infofile = basedir + 'DallOVI/datiniXML/BiblioDatini.xml'
  15. infotree = ET.parse(infofile)
  16. inforoot = infotree.getroot()
  17. infoBiblioNodeList = list(inforoot.iter('Biblio'))
  18. # %%
  19. print(type(lems))
  20. print(lems[:10])
  21. print('Main nodes in BiblioDatini.xml:', len(infoBiblioNodeList))
  22. # %%
  23. # Utils to extract data from the info files
  24. def lemIndex(lem):
  25. for item in lems:
  26. if lem.attrib['n'] in item['coordinate']:
  27. return item['id']
  28. raise ValueError("code " + lem.attrib['n'] + " not found")
  29. def getBiblioNodeBySigla(sigla):
  30. for node in infoBiblioNodeList:
  31. for child in node:
  32. if child.tag=='sigla' and child.text==sigla:
  33. return node
  34. # %%
  35. aa = getBiblioNodeBySigla('A03')
  36. ET.dump(aa)
  37. # %%
  38. # Import individual letter files
  39. indir = basedir + 'DallOVI/datiniXML/xmlgat/'
  40. outdir = basedir + 'DallOVI/datiniXML/xmlevt/'
  41. # %%
  42. # Lems in the xmlgat files have no children;
  43. # Single-word lems are in the tail of the corr. lem tags;
  44. # Multiple-word lems are in <w> tags immediately following the <lem>
  45. # The body of the text is inside a single <div>
  46. # FUNCTION TO PROCESS A FILE
  47. def processFile(indir, filecode):
  48. tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')
  49. root1 = tree.getroot()
  50. textbody = list(root1.iter('div'))[0]
  51. texttags = []
  52. for node in textbody:
  53. if(node.tag == 'lem' or node.tag == 'w'):
  54. texttags.append(node)
  55. worklist = []
  56. doit = False
  57. for node in texttags:
  58. if doit and node.tag=='w':
  59. worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
  60. node.tag = 'NEWlem'
  61. node.attrib = prev_node.attrib
  62. prev_node.tag = 'OLDlem'
  63. if node.tag == 'lem' and node.tail != None:
  64. thelem = re.findall(r'\w+', node.tail)[0] # First word
  65. worklist.append(('single-word', node.attrib['n'], thelem))
  66. node.text = thelem
  67. node.tail = node.tail.replace(thelem, '')
  68. doit = False
  69. else:
  70. doit = True
  71. prev_node = node
  72. for node in textbody.findall('OLDlem'):
  73. textbody.remove(node)
  74. for node in textbody.findall('NEWlem'):
  75. node.tag = 'lem'
  76. for node in textbody.findall('lem'):
  77. node.attrib['n'] = filecode + '_' + node.attrib['n']
  78. for node in textbody.findall('lem'):
  79. node.attrib['ref'] = '#' + str(lemIndex(node))
  80. node.attrib.pop('n')
  81. for node in textbody.findall('lem'):
  82. ind = int(node.attrib['ref'][1:])
  83. if lems[ind]['lemma']['categoria']=='antr.':
  84. sb = ET.SubElement(node, 'persName')
  85. sb.text = node.text
  86. sb.attrib['ref'] = node.attrib['ref']
  87. node.text = ''
  88. else:
  89. if lems[ind]['lemma']['categoria']=='n.g.':
  90. sb = ET.SubElement(node, 'placeName')
  91. sb.text = node.text
  92. sb.attrib['ref'] = node.attrib['ref']
  93. node.text = ''
  94. return tree
  95. #%%
  96. # Example file
  97. filecodeexample = '99b'
  98. tree1 = processFile(indir, filecodeexample)
  99. tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')
  100. # %%
  101. # Another example file
  102. filecodeexample = '80c'
  103. tree1 = processFile(indir, filecodeexample)
  104. tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')
  105. # %%
  106. for file in os.listdir(indir):
  107. try:
  108. local_filecode = file.replace('xmlgat.', '').replace('.xml', '')
  109. local_tree = processFile(indir, local_filecode)
  110. local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
  111. except ET.ParseError:
  112. print("ParseError - " + file)
  113. except KeyError:
  114. print("KeyError - " + file)
  115. except IndexError:
  116. print("IndexError - " + file)
  117. print('DONE!')
  118. # %%
  119. filecodeexample = 'j91'
  120. tree2 = ET.parse(indir + 'xmlgat.' + filecodeexample + '.xml')
  121. # %%
  122. ET.dump(tree2)
  123. # %%
  124. tree3 = processFile(indir, filecodeexample)
  125. # %%
  126. ET.dump(tree3)
  127. # %%
  128. indir + 'xmlgat.' + filecodeexample + '.xml'
  129. # %%
  130. tempdir = "/home/kora/Desktop/FREELANCE_LOCAL/"
  131. # %%
  132. tree3.write(tempdir + 'xmlevt-' + filecodeexample + '.xml')
  133. # %%