import xml.etree.ElementTree as ET
import os
import csv
# Root folder of the local data dump; every other path in this script is
# built by string concatenation from it (note the trailing slash).
baseDir = '/home/kora/Desktop/OVI_Data_local/200_DATI_OVI/dati/'


def _index_by_second_name_part(folder):
    """Return ``[key, filename]`` pairs for the regular files in *folder*.

    The key is the second dot-separated component of the file name
    (e.g. ``'a.c09.xml'`` -> ``'c09'``).  Names that contain no dot, or
    whose second component is empty (``'a..xml'``), are skipped instead of
    raising ``IndexError``.

    The pairs are returned in ``os.listdir`` order, which is arbitrary.
    """
    pairs = []
    for entry in os.listdir(folder):
        # Sub-directories are ignored; only plain files are indexed.
        if not os.path.isfile(os.path.join(folder, entry)):
            continue
        parts = entry.split('.')
        if len(parts) > 1 and parts[1]:
            pairs.append([parts[1], entry])
    return pairs


basepath_gat = baseDir + 'xmlgat'
gat = _index_by_second_name_part(basepath_gat)

basepath_nolemmi = baseDir + 'DatiniXML_incompleto'
nolemmi = _index_by_second_name_part(basepath_nolemmi)

basepath_lemmi = baseDir + 'lemmi'
lemmi = _index_by_second_name_part(basepath_lemmi)
# Parse the bibliography file and collect the lower-cased text of the
# <sigla> child of every <Biblio> element found directly under the root.
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")
sigle = [record.find("sigla").text.lower() for record in biblio]
# Cross-reference table: one row per bibliography sigla, followed by the
# matching file name from each of the three indexed folders (" " when there
# is no match).
params = ["BiblioDatini", "nolemmi", "lemmi", "xmlgat"]

# Exact-match lookups.  Building the dicts from the pair lists keeps the
# "last matching file wins" outcome of the original full scans.
lemmi_by_sigla = {key: name for key, name in lemmi}
gat_by_sigla = {key: name for key, name in gat}

# newline='' is what the csv module expects from a file it writes to; the
# context manager closes the file even if a row fails.
with open(baseDir + 'OVI_Data.csv', 'w', newline='') as OVI_data:
    csvwriter = csv.writer(OVI_data)
    csvwriter.writerow(params)
    for sigla in sigle:
        # NOTE(review): nolemmi is matched by substring (`in`) while lemmi
        # and xmlgat are matched by equality -- confirm this is intended.
        no_lemma = " "
        for key, name in nolemmi:
            if sigla in key:
                no_lemma = name
        lemma = lemmi_by_sigla.get(sigla, " ")
        gatto = gat_by_sigla.get(sigla, " ")
        row = [sigla, no_lemma, lemma, gatto]
        csvwriter.writerow(row)
#Change the file paths
#xml_file_name = 'Ovi/BiblioDatini.xml'
#tree = Xet.parse(xml_file_name)
#root = tree.getroot() --> already defined
#biblio = root.findall("Biblio") --> already defined
# Output file for the flattened bibliography.  It stays open here because
# the rows are written (and the file closed) further down the script.
# newline='' is what the csv module expects from a file it writes to.
Datini_data = open(baseDir + 'Datini_Data.csv', 'w', newline='')
csvwriter = csv.writer(Datini_data)

# One column per distinct tag found anywhere in the tree, except the
# "dataroot" and "Biblio" tags.  dict.fromkeys keeps first-seen order, so
# the column order is the same on every run (a set of strings is iterated
# in an order that can change between interpreter runs).
elemList = [
    tag
    for tag in dict.fromkeys(elem.tag for elem in root.iter())
    if tag not in ("dataroot", "Biblio")
]
param = elemList
csvwriter.writerow(param)
def cell(p, arr):
    """Return the text of the first sub-element of *arr* that matches *p*.

    Args:
        p: Tag name or path passed to ``arr.find``.
        arr: Object exposing ``find`` (an ElementTree element in this script).

    Returns:
        ``" "`` (a single space) when nothing matches; otherwise the matched
        element's ``text``, which is ``None`` for an element without text.
    """
    node = arr.find(p)  # looked up once instead of twice
    if node is None:
        return " "
    return node.text
# Write one CSV row per <Biblio> record, one cell per column in `param`.
# The try/finally guarantees the output file is closed (and flushed) even
# if a record fails part-way through.
try:
    for scheda in biblio:
        aut = []
        for par in param:
            if par == "star_note":
                # Presence flag only: "True" when the record has a
                # star_note element at any depth, " " otherwise.
                r = " " if scheda.find(".//star_note") is None else "True"
            else:
                r = cell(par, scheda)
            aut.append(r)
        csvwriter.writerow(aut)
finally:
    Datini_data.close()
import re
# Output file for the raw lemma lines.  It stays open here because the rows
# are written (and the file closed) further down the script.
# newline='' is what the csv module expects from a file it writes to.
iperlem_data = open(baseDir + 'lem_Data.csv', 'w', newline='')
csvwriter = csv.writer(iperlem_data)
params = ["sigla", "file", "num", "lemma", "commento", "livello"]
csvwriter.writerow(params)
def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per input line.

    Each row is ``[sig, file]`` followed by the ``|``-separated fields of
    the line, each stripped of surrounding whitespace.

    Args:
        lines: Iterable of text lines.
        sig: Value for the first cell of every row.
        file: Value for the second cell of every row.
        writer: Object with a ``writerow`` method.  Defaults to the
            module-level ``csvwriter`` that is current at call time.
    """
    if writer is None:
        writer = csvwriter
    for line in lines:
        # A plain str.split gives the same fields as the former
        # re.split('\|', ...) without the invalid '\|' string escape.
        fields = [field.strip() for field in line.split('|')]
        writer.writerow([sig, file] + fields)
# Dump every file listed in `lemmi` into the open lemma CSV, one row per
# input line.  The input files are read as latin-1.
try:
    for sigla, file_name in lemmi:
        # Change the path here if the data lives elsewhere.
        with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
            lines = f.readlines()
        write_lines(lines, sigla, file_name)
finally:
    # Close (and flush) the CSV even if one of the input files fails.
    iperlem_data.close()
# Accumulator for the parsed rows of every lemma file.
rutto = []


def write_lines_here(lines, sig):
    """Split each line on ``|`` and return the resulting rows.

    Args:
        lines: Iterable of text lines.
        sig: Value placed in the first cell of every row.

    Returns:
        A list with one ``[sig, field, field, ...]`` list per input line;
        every field is stripped of surrounding whitespace.
    """
    # A plain str.split gives the same fields as the former
    # re.split('\|', ...) without the invalid '\|' string escape.
    return [
        [sig] + [field.strip() for field in line.split('|')]
        for line in lines
    ]
# Parse every file listed in `lemmi` and collect all rows in `rutto`.
for sigla, file_name in lemmi:
    # Change the path here if the data lives elsewhere.
    with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
        lines = f.readlines()
    # extend() appends in place; `rutto = rutto + ...` copied the whole
    # accumulated list again for every file.
    rutto.extend(write_lines_here(lines, sigla))
# Notebook outputs recorded in the original for this step:
#   len(rutto)    -> 99660
#   rutto[0]      -> ['c09', '1', 'balla', 's.f.', '']
#   len(prefinal) -> 82090
#   len(final)    -> 6989
# They are kept as comments only: as bare statements they did nothing,
# except `rutto[0]`, which raised IndexError when no rows were loaded.

# Drop the rows whose second cell contains 'IPERLEMMA'.
preprefinal = [row for row in rutto if 'IPERLEMMA' not in row[1]]
# Keep cells 2..4 of each remaining row, as a hashable tuple.
prefinal = [(row[2], row[3], row[4]) for row in preprefinal]
# De-duplicate, then sort.
final = sorted(set(prefinal))
# Write the de-duplicated, sorted tuples to their own CSV.  The context
# manager closes the file, which the original never did, so the last rows
# are guaranteed to reach the disk.  newline='' is what the csv module
# expects from a file it writes to.
with open(baseDir + 'lem_unique.csv', 'w', newline='') as lem_data_unique:
    csvwriter = csv.writer(lem_data_unique)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    csvwriter.writerows(final)
# Exploratory step: load one file from the xmlgat folder and print the
# attributes of each of its <lem> elements.
# NOTE(review): gat[2] is simply the third entry of the directory listing,
# whose order is arbitrary, and it requires at least three indexed files --
# confirm which file is meant.
sample_name = gat[2][1]
smalltree = ET.parse(baseDir + 'xmlgat/' + sample_name)
smallroot = smalltree.getroot()
azz = smallroot.iter('lem')
for node in azz:
    print(node.attrib)
# Recorded notebook output: {'n': '4', 'type': '2'} {'n': '4', 'type': '2'} {'n': '11', 'type': '5'} {'n': '9', 'type': '4'} {'n': '9', 'type': '4'} {'n': '6', 'type': '3'} {'n': '12', 'type': '0'} {'n': '15', 'type': '0'} {'n': '13', 'type': '6'} {'n': '7', 'type': '0'} {'n': '8', 'type': '0'} {'n': '16', 'type': '1'} {'n': '10', 'type': '1'} {'n': '14', 'type': '1'} {'n': '1', 'type': '0'} {'n': '2', 'type': '0'} {'n': '5', 'type': '0'} {'n': '3', 'type': '0'} {'n': '4', 'type': '2'}
# Overwrite the 'n' attribute of every <lem> element with the placeholder
# '100h', print the resulting attributes, then save the modified tree.
for node in smallroot.iter('lem'):
    node.attrib['n'] = '100h'
    print(node.attrib)
smalltree.write(baseDir + 'prova.xml')
# Recorded notebook output: {'n': '100h', 'type': '2'} {'n': '100h', 'type': '2'} {'n': '100h', 'type': '5'} {'n': '100h', 'type': '4'} {'n': '100h', 'type': '4'} {'n': '100h', 'type': '3'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '6'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '1'} {'n': '100h', 'type': '1'} {'n': '100h', 'type': '1'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '0'} {'n': '100h', 'type': '2'}