TEAMOVI/Parser: Parsers: from raw data to stores @ 4d66d8b41d86ed49d45f36724fb2ab1bffafded7

Lemmario_v2.ipynb18 KB
					
					History
					Raw
				
import xml.etree.ElementTree as ET
import os
import csv
from collections import OrderedDict
baseDir = '/home/kora/Desktop/OVI_Data_local/Dati nuOVI (fine Giugno 21)/datiniXML/'

# Build `gat`: one [sigla, file name] pair per regular file in the 'xmlgat'
# folder, where the sigla is the second dot-separated segment of the name.
basepath_gat = baseDir + 'xmlgat'
gat = []
for entry in os.listdir(basepath_gat):
    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(os.path.join(basepath_gat, entry)):
        continue
    parts = entry.split('.')
    # A name without a dot has no second segment; skip it instead of
    # letting the indexing raise IndexError and abort the whole listing.
    if len(parts) > 1 and parts[1] != '':
        gat.append([parts[1], entry])


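# The 'nolemmi' files are not available in the current data drop, so the listing
# below is disabled. NOTE: the OVI_Data.csv export further down still reads `nolemmi`.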
            
#nolemmi = []
#basepath_nolemmi = baseDir + 'DatiniXML_incompleto'
#for entry in os.listdir(basepath_nolemmi):
#    if os.path.isfile(os.path.join(basepath_nolemmi, entry)):
#        nn = entry.split('.')[1]
#        if nn != '':
#            nolemmi.append([nn, entry])

# Build `lemmi`: one [sigla, file name] pair per regular file in the 'lemmi'
# folder, where the sigla is the second dot-separated segment of the name.
lemmi = []
basepath_lemmi = baseDir + 'lemmi'
for entry in os.listdir(basepath_lemmi):
    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(os.path.join(basepath_lemmi, entry)):
        continue
    parts = entry.split('.')
    # A name without a dot has no second segment; skip it instead of
    # letting the indexing raise IndexError and abort the whole listing.
    if len(parts) > 1 and parts[1] != '':
        lemmi.append([parts[1], entry])
Non mi è chiaro a cosa servano le **tre** celle seguenti -- in ogni caso, non ho a disposizione BiblioDatini.xml nella versione corrente dei dati OVI.
# Parse the bibliography file and keep both the tree root and the list of
# its <Biblio> records, which later cells reuse.
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")

# Lower-cased text of the <sigla> child of every record, in document order.
sigle = [record.find("sigla").text.lower() for record in biblio]
# Cross-reference table: for every bibliographic sigla, the matching file
# name (or a single blank) in the 'nolemmi', 'lemmi' and 'xmlgat' listings.

# `nolemmi` is only bound when its listing has been run; treat a missing
# listing as empty instead of failing with NameError half-way through.
try:
    nolemmi
except NameError:
    nolemmi = []

params = ["BiblioDatini", "nolemmi", "lemmi", "xmlgat"]

# Exact-match lookups built once; on duplicate siglas the last entry wins,
# which is what a scan that keeps overwriting its result produces.
lemmi_by_sigla = {lem_sigla: lem_file for lem_sigla, lem_file in lemmi}
gat_by_sigla = {gat_sigla: gat_file for gat_sigla, gat_file in gat}

# newline='' is what the csv module asks for on files handed to csv.writer;
# the `with` block closes the file even if a row fails to be written.
with open(baseDir + 'OVI_Data.csv', 'w', newline='') as OVI_data:
    csvwriter = csv.writer(OVI_data)
    csvwriter.writerow(params)

    for sigla in sigle:
        no_lemma = " "
        # NOTE(review): this column uses a substring test (`in`) while the
        # other two use equality -- kept as is, confirm it is intentional.
        for nolemmi_sigla, nolemmi_file in nolemmi:
            if sigla in nolemmi_sigla:
                no_lemma = nolemmi_file

        lemma = lemmi_by_sigla.get(sigla, " ")
        gatto = gat_by_sigla.get(sigla, " ")
        csvwriter.writerow([sigla, no_lemma, lemma, gatto])
#Cambiare percorsi file
#xml_file_name = 'Ovi/BiblioDatini.xml'
#tree = Xet.parse(xml_file_name)
#root = tree.getroot()           --> già definito
#biblio = root.findall("Biblio") --> già definito


# Open the per-record bibliography export and write its header row.
# newline='' is what the csv module asks for on files handed to csv.writer.
# The handle is deliberately left open here: the rows are written, and the
# file closed, by the loop over `biblio` further down.
Datini_data = open(baseDir + 'Datini_Data.csv', 'w', newline='')
csvwriter = csv.writer(Datini_data)

# Columns = every distinct tag found in the tree, minus the two container
# tags. Sorting makes the column order reproducible from run to run (plain
# set iteration order is not), and the set difference does not raise when
# one of the container tags is absent.
elemList = sorted({elem.tag for elem in root.iter()} - {"dataroot", "Biblio"})

param = elemList

csvwriter.writerow(param)


def cell(p, arr):
    """Return the text of the first sub-element of *arr* matching *p*.

    A single blank string is returned when no sub-element matches. When the
    element exists but carries no text, its ``text`` attribute (``None``)
    is returned unchanged.
    """
    node = arr.find(p)
    return " " if node is None else node.text


# One CSV row per <Biblio> record, with the values in the same order as the
# header columns held in `param`.
for scheda in biblio:
    aut = []

    for par in param:
        if par != "star_note":
            aut.append(cell(par, scheda))
            continue
        # 'star_note' is exported as a presence flag, searched at any depth
        # below the record, rather than as the element's text.
        has_star_note = scheda.find(".//star_note") is not None
        aut.append("True" if has_star_note else " ")

    csvwriter.writerow(aut)

Datini_data.close()
Si riprende da qui
import re


# Open the raw lemma export and write its header row. The handle stays open
# because the rows are appended by the loop over `lemmi` below, which also
# closes it. newline='' is what the csv module asks for on files handed to
# csv.writer.
iperlem_data = open(baseDir + 'lem_Data.csv', 'w', newline='')
csvwriter = csv.writer(iperlem_data)

params = ["sigla", "file", "num", "lemma", "commento", "livello"]
csvwriter.writerow(params)


def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per pipe-separated line.

    Each row is ``[sig, file]`` followed by the '|'-separated fields of the
    line, every field stripped of surrounding whitespace.

    Args:
        lines: iterable of text lines.
        sig: sigla placed in the first column of every row.
        file: file name placed in the second column of every row.
        writer: object with a ``writerow`` method; when omitted, the
            module-level ``csvwriter`` current at call time is used.
    """
    if writer is None:
        writer = csvwriter
    for line in lines:
        # A plain str.split on the literal '|' replaces the regex split,
        # whose pattern '\|' was an invalid escape in a non-raw string.
        fields = [field.strip() for field in line.split('|')]
        writer.writerow([sig, file] + fields)


# Append the rows of every lemma file to the open CSV export.
for sigla, file_name in lemmi:
    # Opening inside `with` closes each file as soon as it has been read;
    # a single close() after the loop only released the last handle and
    # failed with NameError when `lemmi` was empty.
    with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
        write_lines(f, sigla, file_name)

iperlem_data.close()
redundantLemmas = []

def write_lines_here(lines, sig):
    """Parse pipe-separated lines into rows.

    Args:
        lines: iterable of text lines.
        sig: sigla placed in the first position of every row.

    Returns:
        A list with one row per line: ``[sig]`` followed by the
        '|'-separated fields of the line, each stripped of surrounding
        whitespace.
    """
    # A plain str.split on the literal '|' replaces the regex split, whose
    # pattern '\|' was an invalid escape in a non-raw string.
    return [
        [sig] + [field.strip() for field in line.split('|')]
        for line in lines
    ]


# Collect the parsed rows of every lemma file into `redundantLemmas`.
for sigla, file_name in lemmi:
    # `with` closes each file right after reading; a single close() after
    # the loop only released the last handle.
    with open(baseDir + 'lemmi/' + file_name, "r", encoding='latin-1') as f:
        # extend() appends in place instead of copying the whole list on
        # every iteration.
        redundantLemmas.extend(write_lines_here(f, sigla))

print(len(redundantLemmas))
110829
Check di cosa viene fuori dalla lettura
redundantLemmas[0]
['b60', '1', 'denaro', 's.m.', '']
Esporto il lemmario
# LEMMAS: rows whose second field does not contain 'IPERLEMMA', reduced to
# the triple at positions 2-4, then de-duplicated and sorted.
preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]
prefinal = [(row[2], row[3], row[4]) for row in preprefinal]

print(len(prefinal))

final = sorted(set(prefinal))

print(len(final))


# HYPERLEMMAS: same reduction for the rows whose second field does contain
# 'IPERLEMMA'.

preprefinalIPER = [row for row in redundantLemmas if 'IPERLEMMA' in row[1]]
prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]

finalIPER = sorted(set(prefinalIPER))
90150
7591
# Export the de-duplicated lemma list. newline='' is what the csv module
# asks for on files handed to csv.writer, and `with` closes the file even
# if a row fails to be written.
with open(baseDir + 'lem_unique.csv', 'w', newline='') as lem_data_unique:
    csvwriter = csv.writer(lem_data_unique)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
    csvwriter.writerows(final)


# HYPERLEMMAS: same export for the de-duplicated hyperlemma list.

with open(baseDir + 'iperlem_unique.csv', 'w', newline='') as iperlem_data_unique:
    csvwriter = csv.writer(iperlem_data_unique)
    csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
    csvwriter.writerows(finalIPER)
Categorie grammaticali ed export ordinato
# Distinct values found at position 1 of the lemma triples, first as a set
# and then as a sorted list.
cat_gramm = {entry[1] for entry in final}
cat_gramm2 = sorted(cat_gramm)

print(cat_gramm2)
# Group the lemma triples by the value at position 1, keeping the key order
# of `cat_gramm2` and, inside each group, the order of `final`.
byType = OrderedDict()

for type1 in cat_gramm2:
    byType[type1] = [row for row in final if row[1] == type1]


# Export the groups one after the other. newline='' is what the csv module
# asks for on files handed to csv.writer, and `with` closes the file even
# if a row fails to be written.
with open(baseDir + 'lem_unique_byCat.csv', 'w', newline='') as lem_data_byCat:
    csvwriter = csv.writer(lem_data_byCat)
    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])

    for type1 in cat_gramm2:
        print(type1)
        csvwriter.writerows(byType[type1])
Microprova di modifica + export di xml (modifico un attribute di un tag).

Come prima cosa, provo a recuperare la lista dei lemmi di un singolo file, e a rintracciare quel lemma e il suo ID (numero d'ordine) nel lemmario.
# Probe on a single xmlgat file: for every <lem> node, find the row of the
# same sigla whose second field equals the node's 'n' attribute, then print
# the position of that row's triple in the sorted lemma list.
ii = 2

smalltree = ET.parse(baseDir + 'xmlgat/' + gat[ii][1])
smallroot = smalltree.getroot()

localLemNodes = smallroot.iter('lem')

for node in localLemNodes:
    print(node.attrib)
    print(node.tag)
    # The first matching row is the one used.
    candidates = [
        lem for lem in redundantLemmas
        if lem[0] == gat[ii][0] and lem[1] == node.attrib['n']
    ]
    thisLemma = candidates[0]
    lemma_key = (thisLemma[2], thisLemma[3], thisLemma[4])
    newID = final.index(lemma_key)
    print('Lemma: ', thisLemma)
    print('New ID: ', newID)
    print()
{'n': '6', 'type': '1'}
lem
Lemma:  ['d16', '6', 'maggio', 's.m.', '']
New ID:  4247

{'n': '13', 'type': '0'}
lem
Lemma:  ['d16', '13', 'simona (donna di piero di paolo rinaldeschi)', 'antr.', '']
New ID:  6666

{'n': '7', 'type': '0'}
lem
Lemma:  ['d16', '7', 'famiglio', 's.m.', '']
New ID:  2514

{'n': '18', 'type': '2'}
lem
Lemma:  ['d16', '18', 'oncia', 's.f.', '']
New ID:  5069

{'n': '20', 'type': '4'}
lem
Lemma:  ['d16', '20', 'bottoncino', 's.m.', '']
New ID:  1255

{'n': '22', 'type': '0'}
lem
Lemma:  ['d16', '22', 'infilare', 'v.', '']
New ID:  3786

{'n': '9', 'type': '2'}
lem
Lemma:  ['d16', '9', 'braccio', 's.m.', '']
New ID:  1269

{'n': '19', 'type': '4'}
lem
Lemma:  ['d16', '19', 'frangia', 's.f.', '']
New ID:  2907

{'n': '8', 'type': '3'}
lem
Lemma:  ['d16', '8', 'nero', 'agg./s.m.', '']
New ID:  4860

{'n': '15', 'type': '3'}
lem
Lemma:  ['d16', '15', 'azzurro', 'agg./s.m.', '']
New ID:  709

{'n': '16', 'type': '0'}
lem
Lemma:  ['d16', '16', 'nannino (manovale)', 'antr.', '']
New ID:  4807

{'n': '12', 'type': '0'}
lem
Lemma:  ['d16', '12', 'san bartolomeo (chiesa e convento di)', 'n.g.', 'a prato (convento di carmelitani)']
New ID:  6303

{'n': '14', 'type': '0'}
lem
Lemma:  ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']
New ID:  4789

{'n': '11', 'type': '0'}
lem
Lemma:  ['d16', '11', 'garzone', 's.m.', '']
New ID:  3021

{'n': '21', 'type': '1'}
lem
Lemma:  ['d16', '21', 'nona', 's.f.', '']
New ID:  5025

{'n': '10', 'type': '0'}
lem
Lemma:  ['d16', '10', 'domenica', 's.f.', '']
New ID:  2369

{'n': '3', 'type': '0'}
lem
Lemma:  ['d16', '3', 'sere', 's.m.', '']
New ID:  6611

{'n': '17', 'type': '0'}
lem
Lemma:  ['d16', '17', 'clemente di niccolò di piero', 'antr.', '']
New ID:  1900

{'n': '14', 'type': '0'}
lem
Lemma:  ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']
New ID:  4789

{'n': '1', 'type': '0'}
lem
Lemma:  ['d16', '1', 'margherita di domenico bandini', 'antr.', '']
New ID:  4378

{'n': '5', 'type': '0'}
lem
Lemma:  ['d16', '5', 'prato', 'n.g.', '']
New ID:  5821

{'n': '4', 'type': '0'}
lem
Lemma:  ['d16', '4', 'francesco di marco datini', 'antr.', '']
New ID:  2864

{'n': '5', 'type': '0'}
lem
Lemma:  ['d16', '5', 'prato', 'n.g.', '']
New ID:  5821

{'n': '2', 'type': '0'}
lem
Lemma:  ['d16', '2', 'firenze', 'n.g.', '']
New ID:  2708

{'n': '5', 'type': '0'}
lem
Lemma:  ['d16', '5', 'prato', 'n.g.', '']
New ID:  5821

{'n': '6', 'type': '1'}
lem
Lemma:  ['d16', '6', 'maggio', 's.m.', '']
New ID:  4247

Qui faccio una prova di modifica del singolo file e di export in nuovo xml
# Probe: overwrite the 'n' attribute of every <lem> node of one xmlgat file
# with a fixed value and save the result as a new XML file.
ii = 2

source_path = baseDir + 'xmlgat/' + gat[ii][1]
smalltree = ET.parse(source_path)
smallroot = smalltree.getroot()

for node in smallroot.iter('lem'):
    node.attrib['n'] = '100h'

target_path = baseDir + 'prova.xml'
smalltree.write(target_path)
Infine provo un loop su tutti i file, li elaboro e li esporto in una nuova cartella (che va creata a mano) 'newxmlgat'
# Rewrite the 'n' attribute of every <lem> node in every xmlgat file with the
# position of its lemma in the sorted list `final`, and save each file under
# 'newxmlgat/'. Failures are reported and skipped so one bad file or node
# does not stop the batch.

# Build both lookups once instead of scanning `redundantLemmas` and `final`
# for every node. setdefault keeps the first occurrence, matching a
# "first match" scan and list.index.
lemma_by_key = {}
for row in redundantLemmas:
    lemma_by_key.setdefault((row[0], row[1]), row)

new_id_by_lemma = {}
for idx, lemma in enumerate(final):
    new_id_by_lemma.setdefault(lemma, idx)

for ii in range(len(gat)):
    gat_sigla, gat_file = gat[ii]

    # Only the parse itself is guarded here, and the cause is printed; a
    # bare `except:` would also swallow KeyboardInterrupt and hide the reason.
    try:
        smalltree = ET.parse(baseDir + 'xmlgat/' + gat_file)
    except Exception as e:
        print('Parsing original xml file: ', gat_file, ' failed' )
        print(e)
        continue
    smallroot = smalltree.getroot()

    for node in smallroot.iter('lem'):
        try:
            thisLemma = lemma_by_key[(gat_sigla, node.attrib['n'])]
            newID = new_id_by_lemma[(thisLemma[2], thisLemma[3], thisLemma[4])]
        except (KeyError, IndexError) as e:
            # Missing 'n' attribute, unknown lemma, or a row with too few
            # fields: leave this node untouched.
            print('In looking for lemma:')
            print(e)
            continue
        node.set('n', str(newID))

    try:
        smalltree.write(baseDir + 'newxmlgat/'+'newxmlgat.'+gat_sigla+'.xml')
    except Exception as e:
        print('In Export:')
        print(e)
Parsing original xml file:  xmlgat.k01.txt  failed
Parsing original xml file:  xmlgat.j99.txt  failed
Parsing original xml file:  xmlgat.c13.txt  failed
Lemmario_v2.ipynb 18 KB History Raw

Lemmario_v2.ipynb 18 KB

History Raw