|
@@ -2,12 +2,14 @@
|
|
|
import xml.etree.ElementTree as ET
|
|
|
import os
|
|
|
import csv
|
|
|
-from collections import OrderedDict, defaultdict
|
|
|
+from collections import defaultdict
|
|
|
import re
|
|
|
import json
|
|
|
# %%
|
|
|
baseDir = '../../DATA/OVI/datiniXML/'
|
|
|
# %%
|
|
|
+# PREREQUISITE
|
|
|
+# Used to standardize lems in Gatto output xml files for easier parsing
|
|
|
def surroundLems(letterRoot):
|
|
|
|
|
|
textRoot = list(letterRoot.iter('div'))[0]
|
|
@@ -34,36 +36,33 @@ def surroundLems(letterRoot):
|
|
|
node.tag = 'lem'
|
|
|
return textRoot
|
|
|
# %%
|
|
|
# Extract lems from Gatto xml files.
# lemmiGatXml is the output dict; the keys are the OVI 'sigle' (taken from the
# second dot-separated field of each file name) and each value has the form
# {"Filename": <file name>, "lemmi": [<one dict per <lem> element>]}.
lemmiGatXml = {}
basepath_gat = baseDir + 'xmlgat'
for entry in os.listdir(basepath_gat):  # loop on all files in the basepath_gat directory
    if not os.path.isfile(os.path.join(basepath_gat, entry)):  # probably redundant
        continue
    name_parts = entry.split('.')
    if len(name_parts) < 2:
        # No '.' in the name, hence no sigla field: skip instead of raising IndexError.
        continue
    gg = name_parts[1]
    if gg != '':
        lemmiGatXml[gg] = {"Filename": entry}

for sigla, value in lemmiGatXml.items():
    try:
        pluto = surroundLems(ET.parse(os.path.join(basepath_gat, value["Filename"])).getroot())
        value["lemmi"] = []
        for lem in pluto.iter('lem'):
            lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
            value["lemmi"].append(lemRef)
    except Exception as exc:
        # Best effort: report the failing sigla (and why) and continue with the
        # next file. `Exception` rather than a bare `except` so that Ctrl-C and
        # SystemExit are not swallowed.
        print('Error in parsing file:', sigla, exc)
|
|
|
# %%
|
|
|
# List the lemma text files: lemmiGatTxt collects one [sigla, file name] pair
# per regular file in the 'lemmi_txt' directory, where the sigla is the second
# dot-separated field of the file name.
lemmiGatTxt = []
basepath_lemmi = baseDir + 'lemmi_txt'
for entry in os.listdir(basepath_lemmi):
    if not os.path.isfile(os.path.join(basepath_lemmi, entry)):
        continue
    name_parts = entry.split('.')
    if len(name_parts) < 2:
        # No '.' in the name, hence no sigla field: skip instead of raising IndexError.
        continue
    ll = name_parts[1]
    if ll != '':
        lemmiGatTxt.append([ll, entry])
|
|
|
# %%
|
|
|
xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
|
|
|
root = xmlparse.getroot()
|
|
@@ -87,12 +86,12 @@ for sigla in sigle:
|
|
|
lemma = " "
|
|
|
gatto = " "
|
|
|
row.append(no_lemma)
|
|
|
# Find the lemma text file for this sigla. There is no break, so if several
# entries share the sigla the last one wins.
for txt_sigla, txt_file in lemmiGatTxt:
    if sigla == txt_sigla:
        lemma = txt_file
row.append(lemma)
try:
    # The Gatto xml file name is stored under "Filename" where lemmiGatXml is
    # filled; reading the old "File" key always raised KeyError, which was
    # silently swallowed, so the column was never written.
    gatto = lemmiGatXml[sigla]["Filename"]
    row.append(gatto)
except KeyError:
    # No Gatto xml file for this sigla: leave the row without that column.
    pass
|
|
@@ -115,7 +114,7 @@ def write_lines(lines, sig, file):
|
|
|
m = l.strip()
|
|
|
row.append(m)
|
|
|
try:
|
|
|
- gatLems = gat[sig]['lemmi']
|
|
|
+ gatLems = lemmiGatXml[sig]['lemmi']
|
|
|
thisGatLem = next(filter(lambda el: el['num_lemma']==row[2], gatLems), '')
|
|
|
row.insert(4, thisGatLem['num_iperlemma'])
|
|
|
except:
|
|
@@ -123,9 +122,9 @@ def write_lines(lines, sig, file):
|
|
|
csvwriter.writerow(row)
|
|
|
|
|
|
|
|
|
-for x in range(len(lemmi)):
|
|
|
- sigla = lemmi[x][0]
|
|
|
- file_name = lemmi[x][1]
|
|
|
+for x in range(len(lemmiGatTxt)):
|
|
|
+ sigla = lemmiGatTxt[x][0]
|
|
|
+ file_name = lemmiGatTxt[x][1]
|
|
|
# Change path
|
|
|
f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
|
|
|
lines = f.readlines()
|
|
@@ -149,9 +148,9 @@ def write_lines_here(lines, sig):
|
|
|
return toRet
|
|
|
|
|
|
|
|
|
-for x in range(len(lemmi)):
|
|
|
- sigla = lemmi[x][0]
|
|
|
- file_name = lemmi[x][1]
|
|
|
+for x in range(len(lemmiGatTxt)):
|
|
|
+ sigla = lemmiGatTxt[x][0]
|
|
|
+ file_name = lemmiGatTxt[x][1]
|
|
|
# Change path
|
|
|
f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
|
|
|
lines = f.readlines()
|