# TEAMOVI / Parser
# %%
import xml.etree.ElementTree as ET
import os
import csv
from collections import defaultdict
import re
import json
# %%
# Root directory of the OVI data set. Sub-paths are built by plain string
# concatenation (e.g. baseDir + 'xmlgat'), so the value must end with a
# path separator.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML/'
# %%
# PREREQUISITE
# Used to standardize lems in Gatto output xml files for easier parsing
def surroundLems(letterRoot):
    """Normalise lemma markup so that every lemma is a <lem> element with text.

    Works on the first <div> found under ``letterRoot`` (the root itself
    included) and only looks at that div's direct children tagged 'lem'
    or 'w'. Two layouts are handled:

    * a <lem> whose tail contains a word: the first run of word characters
      is moved from the tail into the element's text;
    * a node that did not receive a word that way (a <lem> with no tail or
      with a tail holding no word characters, or a <w>) followed by a <w>:
      the <w> takes over a copy of the previous node's attributes and
      becomes the <lem>, while the previous node is removed from the div.

    NOTE(review): because a <w> also counts as "did not receive a word",
    in a run of consecutive <w> elements only the last one survives.
    This mirrors the original logic - confirm it is intended.

    Args:
        letterRoot: root ``Element`` of a parsed Gatto letter.

    Returns:
        The <div> element, modified in place.

    Raises:
        IndexError: if no <div> element exists under ``letterRoot``.
    """
    textRoot = next(letterRoot.iter('div'), None)
    if textRoot is None:
        # Same exception type the previous list(...)[0] lookup produced.
        raise IndexError('no <div> element found in letter')

    candidates = [node for node in textRoot if node.tag in ('lem', 'w')]

    pending = False   # True when the previous node got no word from its tail
    prev_node = None
    for node in candidates:
        if pending and node.tag == 'w':
            # The word lives in this <w>: promote it and retire the previous
            # node. Temporary tags keep the two roles apart until the end.
            node.tag = 'NEWlem'
            node.attrib = dict(prev_node.attrib)  # copy, do not share the dict
            prev_node.tag = 'OLDlem'

        first_word = None
        if node.tag == 'lem' and node.tail is not None:
            first_word = re.search(r'\w+', node.tail)

        if first_word is not None:
            node.text = first_word.group()
            # Cut out only the matched word; str.replace would also strip
            # every later occurrence of the same characters from the tail.
            node.tail = node.tail[:first_word.start()] + node.tail[first_word.end():]
            pending = False
        else:
            pending = True
        prev_node = node

    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot
# %%
# Extract lems from Gatto xml files
lemmiGatXml = {}  # Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
#
basepath_gat_xml = baseDir + 'xmlgat'
for entry in os.listdir(basepath_gat_xml):  # loop on all entries of the xml directory
    if not os.path.isfile(os.path.join(basepath_gat_xml, entry)):
        continue  # skip sub-directories
    # The sigla is the second dot-separated field of the file name; names
    # without a dot, or with an empty second field, carry no sigla.
    name_parts = entry.split('.')
    if len(name_parts) > 1 and name_parts[1] != '':
        lemmiGatXml[name_parts[1]] = {"Filename": entry}

parsingProblems = []  # sigle whose xml file could not be processed
for sigla, value in lemmiGatXml.items():
    value["lemmi"] = []
    try:
        pluto = surroundLems(ET.parse(os.path.join(basepath_gat_xml, value["Filename"])).getroot())
        for lem in pluto.iter('lem'):
            lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
            value["lemmi"].append(lemRef)
    except Exception as err:
        # Best effort: record the failing sigla and keep going, but do not
        # swallow KeyboardInterrupt/SystemExit and do report the cause.
        print('Error in parsing sigla:', sigla, '-', repr(err))
        parsingProblems.append(sigla)
# %%
# Extract lems from Gatto txt files
lemmiGatTxt = {}  # Output dict keyed by sigla, same layout as the xml one
#
basepath_gat_txt = baseDir + 'lemmi_txt'
for entry in os.listdir(basepath_gat_txt):
    if not os.path.isfile(os.path.join(basepath_gat_txt, entry)):
        continue  # skip sub-directories
    # The sigla is the second dot-separated field of the file name; names
    # without a dot, or with an empty second field, carry no sigla.
    name_parts = entry.split('.')
    if len(name_parts) > 1 and name_parts[1] != '':
        lemmiGatTxt[name_parts[1]] = {"Filename": entry}

for value in lemmiGatTxt.values():
    # Change path
    # Each line is a '|'-separated record; every field is stripped of
    # surrounding whitespace (including the trailing newline).
    with open(os.path.join(basepath_gat_txt, value["Filename"]), "r", encoding='latin-1') as f:
        value["lemmi"] = [[el.strip() for el in line.split('|')] for line in f]
# %%
# Quick visual check: show what was collected for sigla 'l95', txt first, then xml
for lemTable in (lemmiGatTxt, lemmiGatXml):
    print(lemTable['l95'])
# %%
# Do a redundant list of all lemmas (with repetitions) from the files.
# Each output row is: [sigla, <fields of the txt row...>, iperlemma-or-''].
redundantLemmas = []

for sigla, valueTxt in lemmiGatTxt.items():
    valueXml = lemmiGatXml.get(sigla)
    if valueXml is None:
        # A txt file without an xml counterpart is handled like an xml file
        # that failed to parse: its lemmas simply get no iperlemma.
        print('No xml lems available for sigla:', sigla)
        xmlLemmi = []
    else:
        xmlLemmi = valueXml.get('lemmi', [])

    # Index the xml lems by 'num_lemma' once per sigla instead of scanning
    # the whole list for every txt row; setdefault keeps the first match.
    xmlByNum = {}
    for el in xmlLemmi:
        xmlByNum.setdefault(el['num_lemma'], el)

    # Rows whose first field mentions 'IPERLEMMA', in file order; the xml
    # 'num_iperlemma' is used as a 1-based position into this list.
    iperLemmiLocal = [row for row in valueTxt['lemmi'] if 'IPERLEMMA' in row[0]]

    for lemTxt in valueTxt['lemmi']:
        newLemTxt = [sigla, *lemTxt]
        lemXml = xmlByNum.get(lemTxt[0])
        if lemXml is not None and lemXml['num_iperlemma'] != '0':
            num_iperlemma = int(lemXml['num_iperlemma'])
            iperlemma = iperLemmiLocal[num_iperlemma - 1]
            newLemTxt.append(iperlemma[1])
        else:
            newLemTxt.append('')
        redundantLemmas.append(newLemTxt)
# %%
# From 'redundantLemmas' generate a formatted json object without repetitions.
# Rows that mention 'IPERLEMMA' in their second field are left out.
preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]
# Pair the lemma description (fields 2..5) with its coordinates (fields 0 and 1)
prefinal = [((row[2], row[3], row[4], row[5]), (row[0], row[1])) for row in preprefinal]

print(len(prefinal))

# Group the coordinates of identical lemma descriptions together
tmp = defaultdict(list)
for lemmaKey, coordinate in prefinal:
    tmp[lemmaKey].append(coordinate)

finalC = []
for (forma, categoria, note, iperlemma), coordinates in tmp.items():
    finalC.append({
        'lemma': {
            'forma_standard': forma,
            'categoria': categoria,
            'note': note,
            'iperlemma': iperlemma,
        },
        'coordinate': [first + '_' + second for first, second in coordinates],
    })
finalC.sort(key=lambda entry: entry['lemma']['forma_standard'])
print(len(finalC))

# Number the entries following the sorted order
for position, entry in enumerate(finalC):
    entry['id'] = position
# %%
# Write the final lemma list next to the input data as indented json
outputPath = baseDir + "power_lemmarioC.json"
with open(outputPath, "w") as outfile:
    json.dump(finalC, fp=outfile, indent=2)
# %%

# IPERLEMMI - TO BE RETHOUGHT (the code below is disabled)
#preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))
#prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))
#
#finalIPER = list(set(prefinalIPER))
#finalIPER.sort()
# %%
#iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')
#csvwriter = csv.writer(iperlem_data_unique)
#
#csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
#
#for line in finalIPER:
#    csvwriter.writerow(line)
#
#iperlem_data_unique.close()