123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179 |
- # %%
- import xml.etree.ElementTree as ET
- import os
- import csv
- from collections import OrderedDict, defaultdict
- import re
- import json
- # %%
# Base folder that the rest of the script prefixes to every path it opens.
baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/Data/OVI/datiniXML/'
# %%


def _files_by_second_name_part(directory):
    """Return ``[key, filename]`` pairs for the regular files in *directory*.

    ``key`` is the second dot-separated component of the file name
    (``filename.split('.')[1]``).  Files whose name has no second component,
    or an empty one, are left out.
    """
    pairs = []
    for entry in os.listdir(directory):
        if not os.path.isfile(os.path.join(directory, entry)):
            continue
        name_parts = entry.split('.')
        # A name without any '.' has a single part; indexing [1] on it used
        # to abort the whole scan with IndexError.
        if len(name_parts) > 1 and name_parts[1] != '':
            pairs.append([name_parts[1], entry])
    return pairs


basepath_gat = baseDir + 'xmlgat'
gat = _files_by_second_name_part(basepath_gat)

basepath_lemmi = baseDir + 'lemmi_txt'
lemmi = _files_by_second_name_part(basepath_lemmi)
- # %%
# Load the bibliography and collect, in document order, the lower-cased text
# of the <sigla> child of every <Biblio> element directly under the root.
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")
sigle = [bib.find("sigla").text.lower() for bib in biblio]
- # %%
# Write OVI_Data.csv: a header row, then one row per entry of `sigle`, built
# from the [key, filename] pairs collected in `lemmi` and `gat`.
# NOTE(review): this listing has lost its indentation, so it cannot be told
# whether row.append(lemma) / row.append(gatto) run once per sigla (after
# their loop) or once per matching file (inside the `if`) - confirm against
# the original script before restructuring this section.
- OVI_data = open(baseDir + 'OVI_Data.csv', 'w')
- csvwriter = csv.writer(OVI_data)
- params = ["BiblioDatini", "lemmi_txt", "xmlgat"]
- csvwriter.writerow(params)
- for sigla in sigle:
- row = [sigla]
- no_lemma = " "
- lemma = " "
- gatto = " "
- row.append(no_lemma)  # NOTE(review): unconditional blank 2nd field; the header names only 3 columns - confirm this is intended
- for x in range(len(lemmi)):
- if sigla == lemmi[x][0]:
- lemma = lemmi[x][1]
- row.append(lemma)
- for x in range(len(gat)):
- if sigla == gat[x][0]:
- gatto = gat[x][1]
- row.append(gatto)
- csvwriter.writerow(row)
- OVI_data.close()  # not reached if anything above raises; the handle then stays open
- # %%
# Open lem_Data.csv and write its header.  `iperlem_data` and `csvwriter`
# stay bound at module level: write_lines() emits its rows through
# `csvwriter`, and the file is closed after the lemmi files are processed.
# newline='' is what the csv module asks for on files handed to csv.writer,
# so the writer's own line terminator is not translated a second time.
params = ["sigla", "file", "num", "lemma", "commento", "livello"]
iperlem_data = open(baseDir + 'lem_Data.csv', 'w', newline='')
csvwriter = csv.writer(iperlem_data)
csvwriter.writerow(params)
def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per entry of *lines*.

    Each line is split on '|', every piece is stripped of surrounding
    whitespace, and the row ``[sig, file, piece0, piece1, ...]`` is written.

    Args:
        lines: iterable of text lines.
        sig: value placed in the first column of every row.
        file: value placed in the second column of every row.
        writer: object with a ``writerow`` method.  When omitted, the
            module-level ``csvwriter`` bound at call time is used.
    """
    if writer is None:
        writer = csvwriter
    for line in lines:
        # str.split on the literal '|' replaces re.split('\|', ...), whose
        # non-raw pattern contained an invalid escape sequence.
        writer.writerow([sig, file, *(piece.strip() for piece in line.split('|'))])
# Feed every lemmi file through write_lines(), then close lem_Data.csv.
try:
    for sigla, file_name in lemmi:
        # Change this path if the lemmi files are stored elsewhere.
        with open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1') as f:
            lines = f.readlines()
        write_lines(lines, sigla, file_name)
finally:
    # Closed even when a lemmi file cannot be read, so the rows written so
    # far are flushed instead of being left in an open handle.
    iperlem_data.close()
- # %%
# Accumulator for the rows parsed out of the lemmi files; it is filled by
# the loop that follows write_lines_here().
redundantLemmas = []


def write_lines_here(lines, sig):
    """Return one ``[sig, piece0, piece1, ...]`` list per entry of *lines*.

    Each line is split on '|' and every piece is stripped of surrounding
    whitespace.  A line without any '|' yields ``[sig, stripped_line]``.

    Args:
        lines: iterable of text lines.
        sig: value placed first in every returned row.

    Returns:
        A new list of rows, in the order of *lines*.
    """
    # str.split on the literal '|' replaces re.split('\|', ...), whose
    # non-raw pattern contained an invalid escape sequence.
    return [
        [sig, *(piece.strip() for piece in line.split('|'))]
        for line in lines
    ]
# Parse every lemmi file and collect its rows in redundantLemmas.
for sigla, file_name in lemmi:
    # Change this path if the lemmi files are stored elsewhere.
    with open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1') as f:
        lines = f.readlines()
    # extend() appends in place; `redundantLemmas + ...` re-copied the whole
    # accumulated list once per file.
    redundantLemmas.extend(write_lines_here(lines, sigla))
print(len(redundantLemmas))
- # %%
# Keep the rows whose first '|'-separated field does not contain 'IPERLEMMA'.
preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]
# Pair each row's (row[2], row[3], row[4]) triple with its (row[0], row[1]).
prefinal = [((row[2], row[3], row[4]), (row[0], row[1])) for row in preprefinal]
print(len(prefinal))

# Group the (row[0], row[1]) pairs under their triple, in first-seen order.
tmp = defaultdict(list)
for k, v in prefinal:
    tmp[k].append(v)

# `final` keeps each coordinate as a {'file', 'n'} dict, `finalB` joins the
# two parts with '_'.
final = []
finalB = []
for lemma_key, coords in tmp.items():
    final.append({
        'lemma': {'forma_standard': lemma_key[0], 'categoria': lemma_key[1], 'note': lemma_key[2]},
        'coordinate': [{'file': coord[0], 'n': coord[1]} for coord in coords],
    })
    finalB.append({
        'lemma': {'forma_standard': lemma_key[0], 'categoria': lemma_key[1], 'note': lemma_key[2]},
        'coordinate': [coord[0] + '_' + coord[1] for coord in coords],
    })


def _forma_standard(entry):
    """Sort key: the entry's 'forma_standard' string."""
    return entry['lemma']['forma_standard']


final.sort(key=_forma_standard)
finalB.sort(key=_forma_standard)
print(len(final))
print(len(finalB))
- # %%
# Give every entry of finalB an 'id' equal to its position in the list.
for position, entry in enumerate(finalB):
    entry['id'] = position
- # %%
- # IPERLEMMI
# Rows whose first '|'-separated field contains 'IPERLEMMA', reduced to their
# distinct (row[2], row[3], row[4]) triples and sorted.
preprefinalIPER = [row for row in redundantLemmas if 'IPERLEMMA' in row[1]]
prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]
finalIPER = sorted(set(prefinalIPER))
- # %%
# Write the distinct lemmas to lem_unique.csv, one row per entry of `final`.
# newline='' is what the csv module asks for on files handed to csv.writer.
with open(baseDir + 'lem_unique.csv', 'w', newline='') as lem_data_unique:
    csvwriter = csv.writer(lem_data_unique)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    for line in final:
        # Each entry of `final` is a dict; passing it to writerow() directly
        # iterates its keys and writes "lemma,coordinate" on every row.
        # Write the three lemma fields announced by the header instead.
        fields = line['lemma']
        csvwriter.writerow([fields['forma_standard'], fields['categoria'], fields['note']])

# IPERLEMMI
# finalIPER already holds plain 3-tuples, so its rows are written as they are.
with open(baseDir + 'iperlem_unique.csv', 'w', newline='') as iperlem_data_unique:
    csvwriter = csv.writer(iperlem_data_unique)
    csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
    for line in finalIPER:
        csvwriter.writerow(line)
- # %%
- #cat_gramm = set(map(lambda entry: entry[1], final))
- #cat_gramm2 = list(cat_gramm)
- #cat_gramm2.sort()
- #print(cat_gramm2)
- # %%
# Bare expression: evaluates the third entry of `final` and discards it
# (presumably a preview for interactive cell execution - TODO confirm).
# Raises IndexError when `final` has fewer than three entries.
- final[2]
- # %%
# Serialise `final` as JSON with a 2-space indent.
with open(baseDir + "power_lemmario.json", "w") as outfile:
    outfile.write(json.dumps(final, indent=2))
# %%
# Serialise `finalB` the same way.
with open(baseDir + "power_lemmarioB.json", "w") as outfile:
    outfile.write(json.dumps(finalB, indent=2))
- # %%
|