# %%
# Builds the OVI/Datini lemma tables: reads the Gatto XML exports and the
# pipe-separated lemma text files, then writes summary CSV files and a JSON
# lemmary next to the input data.
import xml.etree.ElementTree as ET
import os
import csv
from collections import defaultdict
import re
import json

# %%
baseDir = '../../DATA/OVI/datiniXML/'


# %%
# PREREQUISITE
# Used to standardize lems in Gatto output xml files for easier parsing
def surroundLems(letterRoot):
    """Normalize the <lem> markers of a Gatto letter so each one wraps its word.

    Only the direct <lem> and <w> children of the first <div> found under
    *letterRoot* are considered. Two layouts are handled:

    * ``<lem .../>word`` -- the first word of the tail is moved into the
      <lem> element's text and removed from the tail;
    * ``<lem .../><w>word</w>`` -- the <w> element is retagged as <lem> and
      takes over the attributes of the node before it, which is then removed.

    The tree is modified in place.

    Args:
        letterRoot: root element of a parsed Gatto XML file.

    Returns:
        The first <div> element, after normalization.

    Raises:
        IndexError: if no <div> element exists under *letterRoot*.
    """
    textRoot = next(letterRoot.iter('div'), None)
    if textRoot is None:
        raise IndexError('no <div> element found in letter root')

    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]

    wrap_next = False
    prev_node = None
    for node in texttags:
        if wrap_next and node.tag == 'w':
            # The previous node did not carry its own word: this <w> becomes
            # the lem and the previous node is marked for removal.
            # NOTE(review): this also fires for a <w> that follows another <w>
            # (the earlier <w> is dropped) -- confirm that is intended.
            node.tag = 'NEWlem'
            node.attrib = prev_node.attrib
            prev_node.tag = 'OLDlem'

        first_word = None
        if node.tag == 'lem' and node.tail:
            first_word = re.search(r'\w+', node.tail)

        if first_word is not None:
            node.text = first_word.group()
            # Cut out only the matched word. The previous str.replace() call
            # removed every occurrence of it, even inside other words.
            node.tail = node.tail[:first_word.start()] + node.tail[first_word.end():]
            wrap_next = False
        else:
            # No inline word (missing, empty or whitespace-only tail): the
            # word is expected in the following <w> element.
            wrap_next = True
            prev_node = node

    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


def _list_sigla_files(basepath):
    """Return ``[sigla, filename]`` pairs for the regular files in *basepath*.

    The sigla is the second dot-separated component of the file name. Files
    without such a component (no dot, or an empty second part) are skipped.
    """
    pairs = []
    for entry in os.listdir(basepath):
        if not os.path.isfile(os.path.join(basepath, entry)):
            continue
        parts = entry.split('.')
        if len(parts) > 1 and parts[1] != '':
            pairs.append([parts[1], entry])
    return pairs


def _split_fields(line):
    """Split a pipe-separated lemma line into whitespace-stripped fields."""
    return [field.strip() for field in line.split('|')]


# %%
# Extract lems from Gatto xml files
# Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
basepath_gat = baseDir + 'xmlgat'
lemmiGatXml = {}
for gg, entry in _list_sigla_files(basepath_gat):
    lemmiGatXml[gg] = {"Filename": entry}

for sigla, value in lemmiGatXml.items():
    try:
        pluto = surroundLems(ET.parse(os.path.join(basepath_gat, value["Filename"])).getroot())
        value["lemmi"] = []
        for lem in pluto.iter('lem'):
            value["lemmi"].append({
                'lemma': lem.text,
                'num_lemma': lem.attrib['n'],
                'num_iperlemma': lem.attrib['type'],
            })
    except Exception as err:
        # Best effort: one unreadable file must not stop the whole run, but
        # the cause is reported instead of being swallowed.
        print('Error in parsing file:', sigla, '-', repr(err))

# %%
# [sigla, filename] pairs for the pipe-separated lemma text files
basepath_lemmi = baseDir + 'lemmi_txt'
lemmiGatTxt = _list_sigla_files(basepath_lemmi)

# %%
# Collect the lower-cased sigle listed in the bibliography
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")
sigle = []
for bib in biblio:
    sigla_node = bib.find("sigla")
    if sigla_node is None or not sigla_node.text:
        continue  # bibliography entry without a sigla: nothing to report
    sigle.append(sigla_node.text.lower())

# %%
# One row per bibliography sigla: which lemma text file and which Gatto xml
# file (if any) exist for it. A single space marks a missing file.
# NOTE(review): sigle are lower-cased here but the sigla taken from file names
# is not -- confirm the file names are already lower-case.
lemmi_file_by_sigla = {sig: file_name for sig, file_name in lemmiGatTxt}
with open(baseDir + 'OVI_Data.csv', 'w', newline='', encoding='utf-8') as OVI_data:
    csvwriter = csv.writer(OVI_data)
    params = ["BiblioDatini", "lemmi_txt", "xmlgat"]
    csvwriter.writerow(params)
    for sigla in sigle:
        lemma = lemmi_file_by_sigla.get(sigla, " ")
        # The entries are stored under "Filename"; the former "File" lookup
        # always raised KeyError, so this column was never written.
        gatto = lemmiGatXml.get(sigla, {}).get("Filename", " ")
        csvwriter.writerow([sigla, lemma, gatto])


# %%
def write_lines(lines, sig, file):
    """Write one CSV row per lemma line through the module-level ``csvwriter``.

    Each row is ``[sig, file, <fields of the line>]`` with the hyperlemma
    number inserted at index 4. The number comes from the first Gatto lem of
    *sig* whose ``num_lemma`` equals the line's first field; it is an empty
    string when there is no such lem or no Gatto data for *sig*.

    Args:
        lines: iterable of pipe-separated lemma lines.
        sig: sigla the lines belong to.
        file: name of the file the lines were read from.
    """
    # Index the Gatto lems once per call; setdefault keeps the first match.
    iper_by_num = {}
    for gat_lem in lemmiGatXml.get(sig, {}).get('lemmi', []):
        iper_by_num.setdefault(gat_lem['num_lemma'], gat_lem['num_iperlemma'])

    for line in lines:
        row = [sig, file] + _split_fields(line)
        row.insert(4, iper_by_num.get(row[2], ''))
        csvwriter.writerow(row)


with open(baseDir + 'lem_Data.csv', 'w', newline='', encoding='utf-8') as iperlem_data:
    csvwriter = csv.writer(iperlem_data)
    params = ["sigla", "file", "num", "lemma", "iperlemma", "commento", "livello"]
    csvwriter.writerow(params)
    for sigla, file_name in lemmiGatTxt:
        # Change the path here if the lemma files live elsewhere
        with open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1') as f:
            write_lines(f.readlines(), sigla, file_name)


# %%
def write_lines_here(lines, sig):
    """Return ``[sig, <fields>]`` rows, one per pipe-separated line in *lines*."""
    return [[sig] + _split_fields(line) for line in lines]


redundantLemmas = []
for sigla, file_name in lemmiGatTxt:
    # Change the path here if the lemma files live elsewhere
    with open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1') as f:
        redundantLemmas.extend(write_lines_here(f.readlines(), sigla))
print(len(redundantLemmas))

# %%
# Group the lemma rows by (lemma, category, note). Rows look like
# [sigla, num, lemma, category, note, ...]; shorter rows (e.g. blank lines)
# cannot be grouped and are skipped instead of raising IndexError.
completeRows = [row for row in redundantLemmas if len(row) >= 5]
if len(completeRows) != len(redundantLemmas):
    print('Skipped incomplete rows:', len(redundantLemmas) - len(completeRows))

preprefinal = [row for row in completeRows if 'IPERLEMMA' not in row[1]]
prefinal = [((row[2], row[3], row[4]), (row[0], row[1])) for row in preprefinal]
print(len(prefinal))

tmp = defaultdict(list)
for k, v in prefinal:
    tmp[k].append(v)

# `final` keeps the coordinates as {'file', 'n'} dicts, `finalB` as 'file_n' strings
final = [
    {
        'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]},
        'coordinate': [{'file': el[0], 'n': el[1]} for el in v],
    }
    for k, v in tmp.items()
]
finalB = [
    {
        'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]},
        'coordinate': [el[0] + '_' + el[1] for el in v],
    }
    for k, v in tmp.items()
]
final.sort(key=lambda el: el['lemma']['forma_standard'])
finalB.sort(key=lambda el: el['lemma']['forma_standard'])
print(len(final))
print(len(finalB))

# %%
# Number the sorted entries
for ii, item in enumerate(finalB):
    item['id'] = ii

# %%
# IPERLEMMI
preprefinalIPER = [row for row in completeRows if 'IPERLEMMA' in row[1]]
prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]
finalIPER = sorted(set(prefinalIPER))

# %%
with open(baseDir + 'lem_unique.csv', 'w', newline='', encoding='utf-8') as lem_data_unique:
    csvwriter = csv.writer(lem_data_unique)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    for line in final:
        # `final` holds dicts: write the three lemma fields. Passing the dict
        # itself to writerow() wrote its keys on every row.
        lemma = line['lemma']
        csvwriter.writerow([lemma['forma_standard'], lemma['categoria'], lemma['note']])

# IPERLEMMI
with open(baseDir + 'iperlem_unique.csv', 'w', newline='', encoding='utf-8') as iperlem_data_unique:
    csvwriter = csv.writer(iperlem_data_unique)
    csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
    for line in finalIPER:
        csvwriter.writerow(line)

# %%
# Debug helper: list the distinct grammatical categories
# print(sorted({entry['lemma']['categoria'] for entry in final}))

# %%
# Quick look at one grouped entry (guarded so a tiny dataset does not crash)
if len(final) > 2:
    print(final[2])

# %%
with open(baseDir + "power_lemmarioC.json", "w") as outfile:
    json.dump(finalB, outfile, indent=2)

# %%