# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json

# %%
# Import lems list
basedir = '/home/kora/Desktop/OVI_Data_Local/OVIaspo/DallOVI/datiniXML/'
# Open through a context manager so the handle is closed as soon as the
# lemmario is parsed; JSON text is UTF-8, so do not rely on the locale default.
with open(basedir + 'power_lemmarioB.json', 'r', encoding='utf-8') as lems_file:
    lems = json.load(lems_file)


# %%
def lemIndex(lem):
    """Return the ``id`` of the lemmario entry that lists this lem.

    The lookup key is ``lem.attrib['n']``; the first entry of the module-level
    ``lems`` list whose ``'coordinate'`` value contains that key wins.

    Args:
        lem: an element exposing an ``attrib`` mapping with an ``'n'`` key.

    Returns:
        The ``'id'`` value of the first matching entry.

    Raises:
        KeyError: if the element has no ``'n'`` attribute.
        ValueError: if no entry in ``lems`` lists the key.
    """
    key = lem.attrib['n']
    for item in lems:
        # NOTE(review): 'coordinate' is only ever used with ``in`` here; if it
        # is a string rather than a list this is a substring test -- confirm.
        if key in item['coordinate']:
            return item['id']
    # Only give up once every entry has been checked.
    raise ValueError('no lemmario entry lists coordinate {!r}'.format(key))


# %%
# Import files
# Example file
filecodeexample = '99b'
tree1 = ET.parse(basedir + 'xmlgat/' + 'xmlgat.' + filecodeexample + '.xml')
root1 = tree1.getroot()

# %%
# Lems in the xmlgat files have no children;
# Single-word lems are in the tail of the corresponding <lem> tags;
# Multiple-word lems are in <w> tags immediately following the <lem> tag.
# The body of the text is inside a single <div>.
# TRY TO PROCESS THE EXAMPLE FILE
# The text body is the first <div> reachable from the root.
textbody = next(root1.iter('div'))
# Only direct children of the body that are <lem> or <w> take part in the
# rewrite below.
texttags = [node for node in textbody if node.tag in ('lem', 'w')]
print(len(texttags))
ET.dump(textbody)

# %%
# Move each lemmatised word inside its own <lem> element.
#   * single-word lem: the word is the first word of the <lem> tail; it is
#     moved into the element text and removed from the tail.
#   * multiple-word lem: the <lem> has no word in its tail and the following
#     <w> carries the text; the <w> takes over the attributes and the tag,
#     and the now-empty <lem> is dropped.
# `worklist` records (kind, n, text) for every lem handled.
first_word = re.compile(r'\w+')
worklist = []
doit = False       # True when the previous node left a lem waiting for a <w>
prev_node = None
for node in texttags:
    if doit and node.tag == 'w':
        worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
        node.tag = 'NEWlem'
        # Copy instead of aliasing, so the two elements never share one dict.
        node.attrib = dict(prev_node.attrib)
        prev_node.tag = 'OLDlem'
    match = None
    if node.tag == 'lem' and node.tail is not None:
        match = first_word.search(node.tail)
    if match is not None:
        thelem = match.group()  # First word
        worklist.append(('single-word', node.attrib['n'], thelem))
        node.text = thelem
        # Cut out exactly the matched word; str.replace() would also delete
        # every later occurrence of the same characters in the tail.
        node.tail = node.tail[:match.start()] + node.tail[match.end():]
        doit = False
    else:
        # A <lem> without any word in its tail (or any <w>) arms the
        # multiple-word branch for the next node.
        # NOTE(review): a <w> handled above also lands here, so a second
        # consecutive <w> would replace the first one -- confirm that the
        # data never contains two <w> in a row.
        doit = True
    prev_node = node

# findall() returns a list, so removing while looping is safe.
# NOTE(review): remove() also discards the tail of the removed element;
# presumably these tails hold no text -- confirm.
for node in textbody.findall('OLDlem'):
    textbody.remove(node)
for node in textbody.findall('NEWlem'):
    node.tag = 'lem'
# Prefix every coordinate with the file code.
for node in textbody.findall('lem'):
    node.attrib['n'] = filecodeexample + '_' + node.attrib['n']
ET.dump(textbody)

# %%
# Replace the coordinate attribute with a '#<id>' reference to the lemmario.
for node in textbody.findall('lem'):
    node.attrib['ref'] = '#' + str(lemIndex(node))
    node.attrib.pop('n')
ET.dump(textbody)

# %%
# Wrap the text of names in <persName> and of places in <placeName>.
wrapper_tags = {'antr.': 'persName', 'n.g.': 'placeName'}
for node in textbody.findall('lem'):
    ind = int(node.attrib['ref'][1:])
    # NOTE(review): the id is used as a position in `lems`; this assumes
    # lems[i]['id'] == i -- confirm against the lemmario file.
    wrapper_tag = wrapper_tags.get(lems[ind]['lemma']['categoria'])
    if wrapper_tag is None:
        continue
    sb = ET.SubElement(node, wrapper_tag)
    sb.text = node.text
    sb.attrib['ref'] = node.attrib['ref']
    node.text = ''
ET.dump(textbody)
tree1.write(basedir + 'prova.xml')

# %%
# Lemmario entries categorised as places ('n.g.').
oviPlaces = [
    {'id': item['id'], 'lemma': item['lemma']}
    for item in lems
    if item['lemma']['categoria'] == 'n.g.'
]

# %%
with open(basedir + "ovi_places.json", "w") as outfile:
    json.dump(oviPlaces, outfile, indent=2)

# %%
# Lemmario entries categorised as personal names ('antr.').
oviNames = [
    {'id': item['id'], 'lemma': item['lemma']}
    for item in lems
    if item['lemma']['categoria'] == 'antr.'
]

# %%
with open(basedir + "ovi_names.json", "w") as outfile:
    json.dump(oviNames, outfile, indent=2)

# %%
# Report how many place and name entries were collected, one count per line.
for collected in (oviPlaces, oviNames):
    print(len(collected))

# %%