# %%
import xml.etree.ElementTree as ET
import os
import csv
from collections import defaultdict
import re
import json

# %%
baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML/'


# %%
# PREREQUISITE
# Used to standardize lems in Gatto output xml files for easier parsing
def surroundLems(letterRoot):
    """Normalize the ``lem`` elements of a letter so each one carries its word as text.

    Only the direct ``lem`` and ``w`` children of the first ``div`` found under
    *letterRoot* are considered, in document order:

    * a ``lem`` whose tail contains a word gets that first word moved from its
      tail into its ``text``;
    * otherwise the node is remembered, and if the next considered node is a
      ``w`` that ``w`` takes over the remembered node's attributes and becomes
      the ``lem``, while the remembered node is removed from the ``div``.

    NOTE(review): a ``w`` that directly follows another ``w`` is promoted as
    well (and the earlier one dropped) -- confirm this is intended.

    Args:
        letterRoot: root element of a parsed Gatto XML letter.

    Returns:
        The first ``div`` element, modified in place.

    Raises:
        IndexError: if *letterRoot* contains no ``div`` element.
    """
    textRoot = list(letterRoot.iter('div'))[0]
    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]

    doit = False       # True when the previous node is waiting for a following <w>
    prev_node = None
    for node in texttags:
        if doit and node.tag == 'w':
            # Temporary tag names so this pass does not re-match its own output.
            node.tag = 'NEWlem'
            # Copy instead of aliasing the attribute dict of the node being removed.
            node.attrib = dict(prev_node.attrib)
            prev_node.tag = 'OLDlem'

        # A tail with no word in it (None, whitespace, punctuation only) is
        # handled like a missing tail instead of raising IndexError.
        tail_words = re.findall(r'\w+', node.tail or '') if node.tag == 'lem' else []
        if tail_words:
            thelem = tail_words[0]  # First word
            node.text = thelem
            # count=1: strip only the word that was moved, not every later
            # occurrence of the same character sequence in the tail.
            node.tail = node.tail.replace(thelem, '', 1)
            doit = False
        else:
            doit = True
        prev_node = node

    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


def _indexFilesBySigla(basepath):
    """Return ``{sigla: {"Filename": name}}`` for the regular files in *basepath*.

    The sigla is the second dot-separated chunk of the file name. Names with
    no second chunk (no dot at all) or an empty one are skipped.
    """
    index = {}
    for entry in os.listdir(basepath):
        if not os.path.isfile(os.path.join(basepath, entry)):
            continue
        parts = entry.split('.')
        if len(parts) > 1 and parts[1] != '':
            index[parts[1]] = {"Filename": entry}
    return index


# %%
# Extract lems from Gatto xml files
basepath_gat_xml = baseDir + 'xmlgat'
# Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
lemmiGatXml = _indexFilesBySigla(basepath_gat_xml)

parsingProblems = []
for sigla, value in lemmiGatXml.items():
    value["lemmi"] = []
    try:
        xmlPath = os.path.join(basepath_gat_xml, value["Filename"])
        normalizedDiv = surroundLems(ET.parse(xmlPath).getroot())
        for lem in normalizedDiv.iter('lem'):
            value["lemmi"].append({
                'lemma': lem.text,
                'num_lemma': lem.attrib['n'],
                'num_iperlemma': lem.attrib['type'],
            })
    except Exception as err:
        # Best effort: keep whatever was collected for this file, record the
        # sigla and move on. Unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit through and shows the cause.
        print('Error in parsing sigla:', sigla, '-', repr(err))
        parsingProblems.append(sigla)

# %%
# Extract lems from Gatto txt files
basepath_gat_txt = baseDir + 'lemmi_txt'
lemmiGatTxt = _indexFilesBySigla(basepath_gat_txt)

for sigla, value in lemmiGatTxt.items():
    # Change the path here if needed
    value["lemmi"] = []
    txtPath = os.path.join(basepath_gat_txt, value["Filename"])
    with open(txtPath, "r", encoding='latin-1') as f:
        for line in f:
            if not line.strip():
                # A blank line carries no fields and would break the
                # positional indexing done in the cells below.
                continue
            value["lemmi"].append([el.strip() for el in line.split('|')])

# %%
# Debug output for one hard-coded sigla
print(lemmiGatTxt['l95'])
print(lemmiGatXml['l95'])

# %%
# Do a redundant list of all lemmas (with repetitions) from the files
# Each row is: [sigla, <fields of the txt row...>, iperlemma name or '']
redundantLemmas = []
for sigla, valueTxt in lemmiGatTxt.items():
    valueXml = lemmiGatXml[sigla]
    iperLemmiLocal = [row for row in valueTxt['lemmi'] if 'IPERLEMMA' in row[0]]

    # First XML lem for each lemma number (same pick as a linear first-match scan).
    xmlByNumLemma = {}
    for lemXml in valueXml['lemmi']:
        xmlByNumLemma.setdefault(lemXml['num_lemma'], lemXml)

    for lemTxt in valueTxt['lemmi']:
        newLemTxt = [sigla, *lemTxt]
        lemXml = xmlByNumLemma.get(lemTxt[0])
        if lemXml is not None and lemXml['num_iperlemma'] != '0':
            # num_iperlemma is used as a 1-based position among this file's IPERLEMMA rows
            num_iperlemma = int(lemXml['num_iperlemma'])
            iperlemma = iperLemmiLocal[num_iperlemma - 1]
            newLemTxt.append(iperlemma[1])
        else:
            newLemTxt.append('')
        redundantLemmas.append(newLemTxt)

# %%
# From 'redundantLemmas' generate a formatted json object without repetitions
preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]
# (lemma key, occurrence) pairs: key = row[2..5], occurrence = (sigla, row[1])
prefinal = [
    ((row[2], row[3], row[4], row[5]), (row[0], row[1]))
    for row in preprefinal
]
print(len(prefinal))

# Group the occurrences under their lemma key
tmp = defaultdict(list)
for k, v in prefinal:
    tmp[k].append(v)

finalC = [
    {
        'lemma': {
            'forma_standard': k[0],
            'categoria': k[1],
            'note': k[2],
            'iperlemma': k[3],
        },
        'coordinate': [el[0] + '_' + el[1] for el in v],
    }
    for k, v in tmp.items()
]
finalC.sort(key=lambda el: el['lemma']['forma_standard'])
print(len(finalC))

# Ids follow the sorted order
for ii, item in enumerate(finalC):
    item['id'] = ii

# %%
# Export the json to file
with open(baseDir + "power_lemmarioD.json", "w") as outfile:
    json.dump(finalC, outfile, indent=2)

# %%
# HYPERLEMMAS - TO BE RETHOUGHT
#preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))
#prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))
#
#finalIPER = list(set(prefinalIPER))
#finalIPER.sort()

# %%
#iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')
#csvwriter = csv.writer(iperlem_data_unique)
#
#csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
#
#for line in finalIPER:
#    csvwriter.writerow(line)
#
#iperlem_data_unique.close()