xmlgat_to_EVT.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import json
  6. import os
  7. # %%
  8. # Import lems list + info file + authority files
  9. basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'
  10. # lems
  11. lems = json.load(open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r'))
  12. # %%
  13. def lemIndex(lem):
  14. for item in lems:
  15. if lem.attrib['n'] in item['coordinate']:
  16. return item['id']
  17. raise ValueError("code " + lem.attrib['n'] + " not found")
  18. # %%
  19. # Import individual letter files
  20. indir = basedir + 'DallOVI/datiniXML/xmlgat/'
  21. outdir = basedir + 'DallOVI/datiniXML/xmlevt/'
  22. # %%
# Lems in the xmlgat files have no children.
# Single-word lems are stored in the tail of the corresponding <lem> tag.
# Multiple-word lems are stored in the <w> tag immediately following the <lem>.
# The body of the text is inside a single <div>.
# FUNCTION TO PROCESS A FILE
  28. def processFile(indir, filecode):
  29. tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')
  30. root1 = tree.getroot()
  31. textbody = list(root1.iter('div'))[0]
  32. texttags = []
  33. for node in textbody:
  34. if(node.tag == 'lem' or node.tag == 'w'):
  35. texttags.append(node)
  36. worklist = []
  37. doit = False
  38. for node in texttags:
  39. if doit and node.tag=='w':
  40. worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
  41. node.tag = 'NEWlem'
  42. node.attrib = prev_node.attrib
  43. prev_node.tag = 'OLDlem'
  44. if node.tag == 'lem' and node.tail != None:
  45. thelem = re.findall(r'\w+', node.tail)[0] # First word
  46. worklist.append(('single-word', node.attrib['n'], thelem))
  47. node.text = thelem
  48. node.tail = node.tail.replace(thelem, '')
  49. doit = False
  50. else:
  51. doit = True
  52. prev_node = node
  53. for node in textbody.findall('OLDlem'):
  54. textbody.remove(node)
  55. for node in textbody.findall('NEWlem'):
  56. node.tag = 'lem'
  57. for node in textbody.findall('lem'):
  58. node.attrib['n'] = filecode + '_' + node.attrib['n']
  59. for node in textbody.findall('lem'):
  60. node.attrib['ref'] = '#' + str(lemIndex(node))
  61. node.attrib.pop('n')
  62. for node in textbody.findall('lem'):
  63. ind = int(node.attrib['ref'][1:])
  64. if lems[ind]['lemma']['categoria']=='antr.':
  65. sb = ET.SubElement(node, 'persName')
  66. sb.text = node.text
  67. sb.attrib['ref'] = node.attrib['ref']
  68. node.text = ''
  69. else:
  70. if lems[ind]['lemma']['categoria']=='n.g.':
  71. sb = ET.SubElement(node, 'placeName')
  72. sb.text = node.text
  73. sb.attrib['ref'] = node.attrib['ref']
  74. node.text = ''
  75. return tree
  76. #%%
  77. # Example file
  78. filecodeexample = '99b'
  79. tree1 = processFile(indir, filecodeexample)
  80. tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')
  81. # %%
  82. # Another example file
  83. filecodeexample = '80c'
  84. tree1 = processFile(indir, filecodeexample)
  85. tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')
  86. # %%
  87. for file in os.listdir(indir):
  88. try:
  89. local_filecode = file.replace('xmlgat.', '').replace('.xml', '')
  90. local_tree = processFile(indir, local_filecode)
  91. local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
  92. except ET.ParseError:
  93. print("ParseError - " + file)
  94. except KeyError:
  95. print("KeyError - " + file)
  96. except IndexError:
  97. print("IndexError - " + file)
  98. print('DONE!')
  99. # %%
  100. filecodeexample = 'j92'
  101. tree2 = ET.parse(indir + 'xmlgat.' + filecodeexample + '.xml')
  102. # %%