1 vuosi sitten · d4ccf0b318
--- a/OVI/Lemmi/lemmario_v4.py
+++ b/OVI/Lemmi/lemmario_v4.py
@@ -2,12 +2,14 @@
 
				 import xml.etree.ElementTree as ET
			
 
				 import os
			
 
				 import csv
			
 
				-from collections import OrderedDict, defaultdict
			
 
				+from collections import defaultdict
			
 
				 import re
			
 
				 import json
			
 
				 # %%
			
 
				 baseDir = '../../DATA/OVI/datiniXML/'
			
 
				 # %%
			
 
				+# PREREQUISITE
			
 
				+# Used to standardize lems in Gatto output xml files for easier parsing
			
 
				 def surroundLems(letterRoot):
			
 
				 
			
 
				     textRoot = list(letterRoot.iter('div'))[0]
			
@@ -34,36 +36,33 @@ def surroundLems(letterRoot):
 
				         node.tag = 'lem'
			
 
				     return textRoot
			
 
				 # %%
			
 
				-gat = {}
			
 
				-
			
 
				+# Extract lems from Gatto xml files
			
 
				+lemmiGatXml = {} # Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
			
 
				+#
			
 
				 basepath_gat = baseDir + 'xmlgat'
			
 
				-for entry in os.listdir(basepath_gat):
			
 
				+for entry in os.listdir(basepath_gat): # loop on all files in the basepath_gat directory
			
 
				     if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant
			
 
				         gg = entry.split('.')[1]
			
 
				         if gg != '':
			
 
				-            #gat.append([gg, entry])
			
 
				-            gat[gg]={"File": entry}
			
 
				-# %%
			
 
				-basepath_gat = baseDir + 'xmlgat/'
			
 
				-for code, value in gat.items(): 
			
 
				+            lemmiGatXml[gg]={"Filename": entry}
			
 
				+
			
 
				+for sigla, value in lemmiGatXml.items(): 
			
 
				     try:
			
 
				-        pluto = surroundLems(ET.parse(basepath_gat+value["File"]).getroot())
			
 
				+        pluto = surroundLems(ET.parse(basepath_gat+'/'+value["Filename"]).getroot())
			
 
				         value["lemmi"]=[]
			
 
				         for lem in pluto.iter('lem'):
			
 
				             lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
			
 
				             value["lemmi"].append(lemRef)
			
 
				     except: 
			
 
				-        print (code)
			
 
				-# %%
			
 
				-gat
			
 
				+        print('Error in parsing file:', sigla)
			
 
				 # %%
			
 
				-lemmi = []
			
 
				+lemmiGatTxt = []
			
 
				 basepath_lemmi = baseDir + 'lemmi_txt'
			
 
				 for entry in os.listdir(basepath_lemmi):
			
 
				     if os.path.isfile(os.path.join(basepath_lemmi, entry)):
			
 
				         ll = entry.split('.')[1]
			
 
				         if ll != '':
			
 
				-            lemmi.append([ll, entry])
			
 
				+            lemmiGatTxt.append([ll, entry])
			
 
				 # %%
			
 
				 xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
			
 
				 root = xmlparse.getroot()
			
@@ -87,12 +86,12 @@ for sigla in sigle:
 
				     lemma = " "
			
 
				     gatto = " "
			
 
				     row.append(no_lemma)
			
 
				-    for x in range(len(lemmi)):
			
 
				-        if sigla == lemmi[x][0]:
			
 
				-            lemma = lemmi[x][1]
			
 
				+    for x in range(len(lemmiGatTxt)):
			
 
				+        if sigla == lemmiGatTxt[x][0]:
			
 
				+            lemma = lemmiGatTxt[x][1]
			
 
				     row.append(lemma)
			
 
				     try:
			
 
				-        gatto = gat[sigla]["File"]
			
 
				+        gatto = lemmiGatXml[sigla]["Filename"]  # entries are stored under "Filename" when the xmlgat directory is scanned; "File" no longer exists
			
 
				         row.append(gatto)
			
 
				     except KeyError:
			
 
				         pass
			
@@ -115,7 +114,7 @@ def write_lines(lines, sig, file):
 
				             m = l.strip()
			
 
				             row.append(m)
			
 
				         try:
			
 
				-            gatLems = gat[sig]['lemmi']
			
 
				+            gatLems = lemmiGatXml[sig]['lemmi']
			
 
				             thisGatLem = next(filter(lambda el: el['num_lemma']==row[2], gatLems), '')
			
 
				             row.insert(4, thisGatLem['num_iperlemma'])
			
 
				         except:
			
@@ -123,9 +122,9 @@ def write_lines(lines, sig, file):
 
				         csvwriter.writerow(row)
			
 
				 
			
 
				 
			
 
				-for x in range(len(lemmi)):
			
 
				-    sigla = lemmi[x][0]
			
 
				-    file_name = lemmi[x][1]
			
 
				+for x in range(len(lemmiGatTxt)):
			
 
				+    sigla = lemmiGatTxt[x][0]
			
 
				+    file_name = lemmiGatTxt[x][1]
			
 
				     #Cambia percorso
			
 
				     f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
			
 
				     lines = f.readlines()
			
@@ -149,9 +148,9 @@ def write_lines_here(lines, sig):
 
				     return toRet
			
 
				 
			
 
				 
			
 
				-for x in range(len(lemmi)):
			
 
				-    sigla = lemmi[x][0]
			
 
				-    file_name = lemmi[x][1]
			
 
				+for x in range(len(lemmiGatTxt)):
			
 
				+    sigla = lemmiGatTxt[x][0]
			
 
				+    file_name = lemmiGatTxt[x][1]
			
 
				     #Cambia percorso
			
 
				     f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
			
 
				     lines = f.readlines()