# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json
import os

# %%
# Import lems list + xml info file
basedir = '../../Data/'

# lems
lemfile = basedir + 'DallOVI/datiniXML/power_lemmarioB.json'
# Use a context manager so the handle is closed, and read as UTF-8 (the JSON
# interchange encoding) instead of relying on the platform default.
with open(lemfile, 'r', encoding='utf-8') as lemhandle:
    lems = json.load(lemhandle)

# BiblioDatini.xml
infofile = basedir + 'DallOVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))

# %%
print(type(lems))
print(lems[:10])
print('Main nodes in BiblioDatini.xml:', len(infoBiblioNodeList))

# %%
# Utils to extract data from the info files


def lemIndex(lem):
    """Return the 'id' of the first entry of ``lems`` matching *lem*.

    An entry matches when the element's ``n`` attribute is contained in the
    entry's 'coordinate' value (membership test with ``in``).

    Raises:
        KeyError: if *lem* has no ``n`` attribute.
        ValueError: if no entry of ``lems`` contains the code.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item['id']
    raise ValueError("code " + code + " not found")


def getBiblioNodeBySigla(sigla):
    """Return the first 'Biblio' node with a <sigla> child whose text is *sigla*.

    Returns None when no node in ``infoBiblioNodeList`` matches.
    """
    for node in infoBiblioNodeList:
        if any(child.tag == 'sigla' and child.text == sigla for child in node):
            return node
    return None


# %%
aa = getBiblioNodeBySigla('A03')
ET.dump(aa)

# %%
# Import individual letter files
indir = basedir + 'DallOVI/datiniXML/xmlgat/'
outdir = basedir + 'DallOVI/datiniXML/xmlevt/'

# %%
# Lems in the xmlgat files have no children;
# Single-word lems are in the tail of the corresponding <lem> tags;
# Multiple-word lems are in <w> tags immediately following the <lem> tag;
# The body of the text is inside a single <div>.
# FUNCTION TO PROCESS A FILE
def processFile(indir, filecode):
    """Parse one xmlgat file and rewrite its <lem> elements in place.

    The file read is ``indir + 'xmlgat.' + filecode + '.xml'``. Only direct
    <lem> and <w> children of the first <div> found are considered.

    Steps:
      1. A <lem> with a tail receives the first word of that tail as its
         text, and that word is removed from the tail.
      2. A <w> that follows an element not handled by step 1 takes over that
         element's attributes and becomes the <lem>; the preceding element
         is removed from the <div>.
      3. Every remaining <lem> gets ``ref="#<id>"`` (looked up with
         ``lemIndex`` on ``filecode + '_' + n``) in place of its ``n``
         attribute.
      4. If the lemma category is 'antr.' or 'n.g.', the text is moved into
         a <persName> or <placeName> child carrying the same ``ref``.

    Args:
        indir: directory prefix (string concatenation, so it must end with
            the path separator).
        filecode: code of the letter file, e.g. '99b'.

    Returns:
        The modified ``ElementTree``.

    Raises:
        IndexError: no <div> in the file, or a <lem> tail without any word
            character.
        KeyError: a <lem> without an ``n`` attribute.
        ValueError: propagated from ``lemIndex`` when the code is unknown.
    """
    tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')
    root1 = tree.getroot()
    # Kept as list(...)[0] so a missing <div> still raises IndexError,
    # which the batch loop below reports per file.
    textbody = list(root1.iter('div'))[0]

    texttags = [node for node in textbody if node.tag in ('lem', 'w')]

    doit = False
    prev_node = None
    for node in texttags:
        if doit and node.tag == 'w':
            # Multiple-word lem: the <w> replaces the element before it.
            node.tag = 'NEWlem'
            node.attrib = prev_node.attrib
            prev_node.tag = 'OLDlem'
        if node.tag == 'lem' and node.tail is not None:
            thelem = re.findall(r'\w+', node.tail)[0]  # First word
            node.text = thelem
            # count=1: strip only the leading word. Without it every later
            # occurrence of the same characters (even inside other words)
            # would be deleted from the running text.
            node.tail = node.tail.replace(thelem, '', 1)
            doit = False
        else:
            doit = True
        prev_node = node

    for node in textbody.findall('OLDlem'):
        textbody.remove(node)
    for node in textbody.findall('NEWlem'):
        node.tag = 'lem'

    for node in textbody.findall('lem'):
        # lemIndex reads the 'n' attribute, so prefix it before the lookup.
        node.attrib['n'] = filecode + '_' + node.attrib['n']
        node.attrib['ref'] = '#' + str(lemIndex(node))
        node.attrib.pop('n')

        # NOTE(review): the id returned by lemIndex is used as a position in
        # `lems`; this assumes ids equal list indices - confirm.
        ind = int(node.attrib['ref'][1:])
        category = lems[ind]['lemma']['categoria']
        if category == 'antr.':
            name_tag = 'persName'
        elif category == 'n.g.':
            name_tag = 'placeName'
        else:
            continue
        sb = ET.SubElement(node, name_tag)
        sb.text = node.text
        sb.attrib['ref'] = node.attrib['ref']
        node.text = ''

    return tree


# %%
# Example file
filecodeexample = '99b'
tree1 = processFile(indir, filecodeexample)
tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')

# %%
# Another example file
filecodeexample = '80c'
tree1 = processFile(indir, filecodeexample)
tree1.write(outdir + 'xmlevt-' + filecodeexample + '.xml')

# %%
# Convert every file in indir; failures are reported per file and skipped.
# NOTE(review): a ValueError raised by lemIndex is not caught here and will
# stop the batch - confirm that this is intended.
for file in os.listdir(indir):
    try:
        local_filecode = file.replace('xmlgat.', '').replace('.xml', '')
        local_tree = processFile(indir, local_filecode)
        local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
    except ET.ParseError:
        print("ParseError - " + file)
    except KeyError:
        print("KeyError - " + file)
    except IndexError:
        print("IndexError - " + file)
print('DONE!')

# %%
filecodeexample = 'j91'
tree2 = ET.parse(indir + 'xmlgat.' + filecodeexample + '.xml')

# %%
ET.dump(tree2)

# %%
tree3 = processFile(indir, filecodeexample)

# %%
ET.dump(tree3)

# %%
indir + 'xmlgat.' + filecodeexample + '.xml'

# %%
tempdir = "/home/kora/Desktop/FREELANCE_LOCAL/"

# %%
tree3.write(tempdir + 'xmlevt-' + filecodeexample + '.xml')

# %%