import xml.etree.ElementTree as ET
import os
import csv
from collections import OrderedDict
# Root folder of the Datini XML exports. Later sections build paths by plain
# string concatenation onto this value, so the trailing slash is required.
baseDir = '/home/kora/Desktop/OVI_Data_local/Dati nuOVI (fine Giugno 21)/datiniXML/'

# Collect [sigla, file name] pairs for every regular file in 'xmlgat'.
# The sigla is the second dot-separated token of the file name
# (e.g. 'xmlgat.k01.txt' -> 'k01').
gat = []
basepath_gat = baseDir + 'xmlgat'
for entry in os.listdir(basepath_gat):
    if not os.path.isfile(os.path.join(basepath_gat, entry)):
        continue
    name_parts = entry.split('.')
    # A name without any dot has no second token: skip it instead of
    # failing with IndexError.
    if len(name_parts) > 1 and name_parts[1] != '':
        gat.append([name_parts[1], entry])
# The 'nolemmi' files are not available at the moment, so the listing below is disabled.
#nolemmi = []
#basepath_nolemmi = baseDir + 'DatiniXML_incompleto'
#for entry in os.listdir(basepath_nolemmi):
# if os.path.isfile(os.path.join(basepath_nolemmi, entry)):
# nn = entry.split('.')[1]
# if nn != '':
# nolemmi.append([nn, entry])
# Collect [sigla, file name] pairs for every regular file in 'lemmi'.
# The sigla is the second dot-separated token of the file name.
lemmi = []
basepath_lemmi = baseDir + 'lemmi'
for entry in os.listdir(basepath_lemmi):
    if not os.path.isfile(os.path.join(basepath_lemmi, entry)):
        continue
    name_parts = entry.split('.')
    # A name without any dot has no second token: skip it instead of
    # failing with IndexError.
    if len(name_parts) > 1 and name_parts[1] != '':
        lemmi.append([name_parts[1], entry])
# Load the bibliography and gather the lower-cased <sigla> text of every
# <Biblio> record directly under the root.
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")
sigle = [bib.find("sigla").text.lower() for bib in biblio]
# Cross-reference table: one row per bibliography sigla, holding the matching
# file name from each folder (a single space where there is none).
try:
    nolemmi_entries = nolemmi
except NameError:
    # The 'nolemmi' listing is commented out earlier in this file; treat it
    # as empty so its column is written blank instead of the script crashing.
    nolemmi_entries = []

# Building the dicts in list order lets later entries overwrite earlier ones,
# which reproduces the "last match wins" result of a full scan.
lemmi_by_sigla = {key: name for key, name in lemmi}
gat_by_sigla = {key: name for key, name in gat}

params = ["BiblioDatini", "nolemmi", "lemmi", "xmlgat"]
# newline='' is how the csv module expects its output files to be opened;
# the with-block closes the file even if a row fails.
with open(baseDir + 'OVI_Data.csv', 'w', newline='') as OVI_data:
    csvwriter = csv.writer(OVI_data)
    csvwriter.writerow(params)
    for sigla in sigle:
        no_lemma = " "
        # Substring test (not equality) is kept as it was for 'nolemmi'.
        for key, name in nolemmi_entries:
            if sigla in key:
                no_lemma = name
        lemma = lemmi_by_sigla.get(sigla, " ")
        gatto = gat_by_sigla.get(sigla, " ")
        csvwriter.writerow([sigla, no_lemma, lemma, gatto])
# Change the file paths as needed.
#xml_file_name = 'Ovi/BiblioDatini.xml'
#tree = Xet.parse(xml_file_name)
#root = tree.getroot() --> already defined
#biblio = root.findall("Biblio") --> already defined
# The handle is left open here: the rows are written, and the file is closed,
# further down in the script.
Datini_data = open(baseDir + 'Datini_Data.csv', 'w', newline='')
csvwriter = csv.writer(Datini_data)

# Header: every distinct tag found in the tree, in order of first appearance.
# dict.fromkeys de-duplicates while keeping that order, so the column layout
# is the same on every run (a set would reorder it from run to run).
elemList = list(dict.fromkeys(elem.tag for elem in root.iter()))
# 'dataroot' and 'Biblio' are not written as columns; tolerate their absence.
for excluded_tag in ("dataroot", "Biblio"):
    if excluded_tag in elemList:
        elemList.remove(excluded_tag)
param = elemList
csvwriter.writerow(param)
def cell(p, arr):
    """Return the text of the first sub-element of *arr* matching *p*.

    Args:
        p: Tag name or path accepted by ``Element.find``.
        arr: Element to search in.

    Returns:
        The ``text`` of the matched element (``None`` when that element has
        no text), or a single space ``" "`` when nothing matches.
    """
    found = arr.find(p)
    # Compare against None explicitly: an element without children is falsy.
    if found is None:
        return " "
    return found.text
# One CSV row per Biblio record, with cells in the same order as `param`.
try:
    for scheda in biblio:
        aut = []
        for par in param:
            if par == "star_note":
                # Only the presence of a star_note anywhere below the record
                # is recorded, not its content.
                r = " " if scheda.find(".//star_note") is None else "True"
            else:
                r = cell(par, scheda)
            aut.append(r)
        csvwriter.writerow(aut)
finally:
    # Close the output even when a row fails, so buffered rows are flushed
    # and the handle is not leaked.
    Datini_data.close()
import re
# Long-format table of the 'lemmi' files. The handle is left open here: the
# rows are written, and the file is closed, further down in the script.
# newline='' is how the csv module expects its output files to be opened.
iperlem_data = open(baseDir + 'lem_Data.csv', 'w', newline='')
csvwriter = csv.writer(iperlem_data)
params = ["sigla", "file", "num", "lemma", "commento", "livello"]
csvwriter.writerow(params)
def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per pipe-delimited line.

    Each row is ``[sig, file]`` followed by the ``|``-separated fields of the
    line, each stripped of surrounding whitespace.

    Args:
        lines: Iterable of text lines.
        sig: Value written in the first column of every row.
        file: Value written in the second column of every row.
        writer: Object exposing ``writerow``. When omitted, the module-level
            ``csvwriter`` bound at call time is used.
    """
    if writer is None:
        # Resolved at call time, so whichever writer the script has most
        # recently bound to `csvwriter` receives the rows.
        writer = csvwriter
    for line in lines:
        # A plain str.split is enough for a literal '|' separator.
        fields = [field.strip() for field in line.split('|')]
        writer.writerow([sig, file] + fields)
# Append the rows of every file in 'lemmi' to lem_Data.csv.
try:
    for sigla, file_name in lemmi:
        # Change this path as needed.
        # The with-block closes each input file even if writing fails.
        with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
            write_lines(f, sigla, file_name)
finally:
    # Close the output even when a file fails, so buffered rows are flushed.
    iperlem_data.close()
# Accumulator for the parsed rows of all lemma files.
redundantLemmas = []


def write_lines_here(lines, sig):
    """Parse pipe-delimited lines into rows, without writing anything.

    Args:
        lines: Iterable of text lines.
        sig: Value placed first in every row.

    Returns:
        A list with one list per input line: ``sig`` followed by the
        ``|``-separated fields of the line, each stripped of surrounding
        whitespace. An empty input gives an empty list.
    """
    rows = []
    for line in lines:
        # A plain str.split is enough for a literal '|' separator.
        fields = [field.strip() for field in line.split('|')]
        rows.append([sig] + fields)
    return rows
# Parse every file in 'lemmi' and gather all rows in redundantLemmas.
for sigla, file_name in lemmi:
    # Change this path as needed.
    # The with-block closes each input file even if parsing fails.
    with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
        # extend() grows the list in place; rebuilding it with `+` for every
        # file would copy all earlier rows each time.
        redundantLemmas.extend(write_lines_here(f, sigla))
print(len(redundantLemmas))
# Output: 110829
# Bare expression, presumably left over from an interactive session: as a
# plain script statement it evaluates the first parsed row and discards it.
redundantLemmas[0]
# Output: ['b60', '1', 'denaro', 's.m.', '']
# Ordinary lemmas: rows whose second column does not contain 'IPERLEMMA',
# reduced to their third, fourth and fifth columns, de-duplicated and sorted.
preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]
prefinal = [(row[2], row[3], row[4]) for row in preprefinal]
print(len(prefinal))
final = sorted(set(prefinal))
print(len(final))
# Hyper-lemmas: the same reduction applied to the rows that do contain
# 'IPERLEMMA' in their second column.
preprefinalIPER = [row for row in redundantLemmas if 'IPERLEMMA' in row[1]]
prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]
finalIPER = sorted(set(prefinalIPER))
# Output: 90150 7591
# Export the unique lemmas, one per row, below a three-column header.
# The with-block closes the file even if a row fails; newline='' is how the
# csv module expects its output files to be opened.
with open(baseDir + 'lem_unique.csv', 'w', newline='') as lem_data_unique:
    csvwriter = csv.writer(lem_data_unique)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    csvwriter.writerows(final)
# Hyper-lemmas: export the unique entries, one per row, below a three-column
# header. The with-block closes the file even if a row fails; newline='' is
# how the csv module expects its output files to be opened.
with open(baseDir + 'iperlem_unique.csv', 'w', newline='') as iperlem_data_unique:
    csvwriter = csv.writer(iperlem_data_unique)
    csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
    csvwriter.writerows(finalIPER)
# Distinct values of the second field of the unique lemmas, sorted.
cat_gramm = {entry[1] for entry in final}
cat_gramm2 = sorted(cat_gramm)
print(cat_gramm2)
# Group the lemmas by that field in a single pass. Keys follow the sorted
# category order and each group keeps the order the entries have in `final`,
# the same result as filtering `final` once per category.
byType = OrderedDict((type1, []) for type1 in cat_gramm2)
for entry in final:
    byType[entry[1]].append(entry)
# Export the unique lemmas grouped by category, printing each category name
# as its group is written. The with-block closes the file even if a row
# fails; newline='' is how the csv module expects its output files opened.
with open(baseDir + 'lem_unique_byCat.csv', 'w', newline='') as lem_data_byCat:
    csvwriter = csv.writer(lem_data_byCat)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    for type1 in cat_gramm2:
        print(type1)
        csvwriter.writerows(byType[type1])
# Dry run on a single xmlgat file: for every <lem> node print its attributes,
# its tag, the parsed lemma row it refers to and the index that lemma has in
# `final`.
ii = 2
smalltree = ET.parse(baseDir + 'xmlgat/' + gat[ii][1])
smallroot = smalltree.getroot()
# Index this file's rows by their second column once, instead of scanning all
# rows for every node. setdefault keeps the first row per key, i.e. the row a
# first-match linear search would return.
localRows = {}
for lem in redundantLemmas:
    if lem[0] == gat[ii][0]:
        localRows.setdefault(lem[1], lem)
localLemNodes = smallroot.iter('lem')
for node in localLemNodes:
    print(node.attrib)
    print(node.tag)
    #
    thisLemma = localRows.get(node.attrib.get('n'))
    try:
        newID = final.index((thisLemma[2], thisLemma[3], thisLemma[4]))
    except (TypeError, IndexError, ValueError):
        # No row for this node, a row with too few fields, or a lemma that is
        # not in `final`: report None instead of aborting the dry run.
        newID = None
    #
    print('Lemma: ', thisLemma)
    print('New ID: ', newID)
    print()
# Output: {'n': '6', 'type': '1'} lem Lemma: ['d16', '6', 'maggio', 's.m.', ''] New ID: 4247 {'n': '13', 'type': '0'} lem Lemma: ['d16', '13', 'simona (donna di piero di paolo rinaldeschi)', 'antr.', ''] New ID: 6666 {'n': '7', 'type': '0'} lem Lemma: ['d16', '7', 'famiglio', 's.m.', ''] New ID: 2514 {'n': '18', 'type': '2'} lem Lemma: ['d16', '18', 'oncia', 's.f.', ''] New ID: 5069 {'n': '20', 'type': '4'} lem Lemma: ['d16', '20', 'bottoncino', 's.m.', ''] New ID: 1255 {'n': '22', 'type': '0'} lem Lemma: ['d16', '22', 'infilare', 'v.', ''] New ID: 3786 {'n': '9', 'type': '2'} lem Lemma: ['d16', '9', 'braccio', 's.m.', ''] New ID: 1269 {'n': '19', 'type': '4'} lem Lemma: ['d16', '19', 'frangia', 's.f.', ''] New ID: 2907 {'n': '8', 'type': '3'} lem Lemma: ['d16', '8', 'nero', 'agg./s.m.', ''] New ID: 4860 {'n': '15', 'type': '3'} lem Lemma: ['d16', '15', 'azzurro', 'agg./s.m.', ''] New ID: 709 {'n': '16', 'type': '0'} lem Lemma: ['d16', '16', 'nannino (manovale)', 'antr.', ''] New ID: 4807 {'n': '12', 'type': '0'} lem Lemma: ['d16', '12', 'san bartolomeo (chiesa e convento di)', 'n.g.', 'a prato (convento di carmelitani)'] New ID: 6303 {'n': '14', 'type': '0'} lem Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', ''] New ID: 4789 {'n': '11', 'type': '0'} lem Lemma: ['d16', '11', 'garzone', 's.m.', ''] New ID: 3021 {'n': '21', 'type': '1'} lem Lemma: ['d16', '21', 'nona', 's.f.', ''] New ID: 5025 {'n': '10', 'type': '0'} lem Lemma: ['d16', '10', 'domenica', 's.f.', ''] New ID: 2369 {'n': '3', 'type': '0'} lem Lemma: ['d16', '3', 'sere', 's.m.', ''] New ID: 6611 {'n': '17', 'type': '0'} lem Lemma: ['d16', '17', 'clemente di niccolò di piero', 'antr.', ''] New ID: 1900 {'n': '14', 'type': '0'} lem Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', ''] New ID: 4789 {'n': '1', 'type': '0'} lem Lemma: ['d16', '1', 'margherita di domenico bandini', 'antr.', ''] New ID: 4378 {'n': '5', 'type': '0'} lem Lemma: ['d16', '5', 'prato', 'n.g.', ''] New ID: 5821
# {'n': '4', 'type': '0'} lem Lemma: ['d16', '4', 'francesco di marco datini', 'antr.', ''] New ID: 2864 {'n': '5', 'type': '0'} lem Lemma: ['d16', '5', 'prato', 'n.g.', ''] New ID: 5821 {'n': '2', 'type': '0'} lem Lemma: ['d16', '2', 'firenze', 'n.g.', ''] New ID: 2708 {'n': '5', 'type': '0'} lem Lemma: ['d16', '5', 'prato', 'n.g.', ''] New ID: 5821 {'n': '6', 'type': '1'} lem Lemma: ['d16', '6', 'maggio', 's.m.', ''] New ID: 4247
# Trial of the attribute rewrite on one file: give every <lem> node the
# placeholder n="100h" and save the tree as 'prova.xml'.
ii = 2
sample_source = baseDir + 'xmlgat/' + gat[ii][1]
smalltree = ET.parse(sample_source)
smallroot = smalltree.getroot()
for node in smallroot.iter('lem'):
    node.attrib['n'] = '100h'
sample_target = baseDir + 'prova.xml'
smalltree.write(sample_target)
# Rewrite the n attribute of every <lem> node in every xmlgat file so that it
# holds the index of the referenced lemma in `final`, then save each tree
# under 'newxmlgat/'. Failures are reported and the run continues.

# Index the parsed rows once instead of scanning them for every node.
# setdefault keeps the first row per (sigla, number) pair, i.e. the row a
# first-match linear search would return.
rowBySiglaAndNum = {}
for lem in redundantLemmas:
    rowBySiglaAndNum.setdefault((lem[0], lem[1]), lem)
# Position of each lemma tuple in `final`, replacing repeated list.index calls.
idByLemma = {}
for position, lemmaKey in enumerate(final):
    idByLemma.setdefault(lemmaKey, position)

for ii in range(len(gat)):
    try:
        smalltree = ET.parse(baseDir + 'xmlgat/' + gat[ii][1])
    except Exception as e:
        # Best effort: report the file and move on. Unlike a bare except,
        # this lets KeyboardInterrupt / SystemExit through.
        print('Parsing original xml file: ', gat[ii][1], ' failed' )
        print(e)
        continue
    smallroot = smalltree.getroot()
    #
    localLemNodes = smallroot.iter('lem')
    #
    for node in localLemNodes:
        try:
            thisLemma = rowBySiglaAndNum[(gat[ii][0], node.attrib['n'])]
            newID = idByLemma[(thisLemma[2], thisLemma[3], thisLemma[4])]
            #
            node.set('n', str(newID))
        except (KeyError, IndexError) as e:
            # KeyError: node without n, no parsed row, or lemma not in
            # `final`. IndexError: parsed row with fewer than five columns.
            print('In looking for lemma:')
            print(e)
    try:
        smalltree.write(baseDir + 'newxmlgat/'+'newxmlgat.'+gat[ii][0]+'.xml')
    except Exception as e:
        print('In Export:')
        print(e)
# Output: Parsing original xml file: xmlgat.k01.txt failed Parsing original xml file: xmlgat.j99.txt failed Parsing original xml file: xmlgat.c13.txt failed