kora 1 год назад
Родитель
Commit
d4ccf0b318
1 измененных файлов с 25 добавлено и 26 удалено
  1. 25 26
      OVI/Lemmi/lemmario_v4.py

+ 25 - 26
OVI/Lemmi/lemmario_v4.py

@@ -2,12 +2,14 @@
 import xml.etree.ElementTree as ET
 import os
 import csv
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 import re
 import json
 # %%
 baseDir = '../../DATA/OVI/datiniXML/'
 # %%
+# PREREQUISITE
+# Used to standardize lems in Gatto output xml files for easier parsing
 def surroundLems(letterRoot):
 
     textRoot = list(letterRoot.iter('div'))[0]
@@ -34,36 +36,33 @@ def surroundLems(letterRoot):
         node.tag = 'lem'
     return textRoot
 # %%
-gat = {}
-
+# Extract lems from Gatto xml files
+lemmiGatXml = {} # Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
+#
 basepath_gat = baseDir + 'xmlgat'
-for entry in os.listdir(basepath_gat):
+for entry in os.listdir(basepath_gat): # loop on all files in the basepath_gat directory
     if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant
         gg = entry.split('.')[1]
         if gg != '':
-            #gat.append([gg, entry])
-            gat[gg]={"File": entry}
-# %%
-basepath_gat = baseDir + 'xmlgat/'
-for code, value in gat.items(): 
+            lemmiGatXml[gg]={"Filename": entry}
+
+for sigla, value in lemmiGatXml.items(): 
     try:
-        pluto = surroundLems(ET.parse(basepath_gat+value["File"]).getroot())
+        pluto = surroundLems(ET.parse(basepath_gat+'/'+value["Filename"]).getroot())
         value["lemmi"]=[]
         for lem in pluto.iter('lem'):
             lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
             value["lemmi"].append(lemRef)
     except: 
-        print (code)
-# %%
-gat
+        print('Error in parsing file:', sigla)
 # %%
-lemmi = []
+lemmiGatTxt = []
 basepath_lemmi = baseDir + 'lemmi_txt'
 for entry in os.listdir(basepath_lemmi):
     if os.path.isfile(os.path.join(basepath_lemmi, entry)):
         ll = entry.split('.')[1]
         if ll != '':
-            lemmi.append([ll, entry])
+            lemmiGatTxt.append([ll, entry])
 # %%
 xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
 root = xmlparse.getroot()
@@ -87,12 +86,12 @@ for sigla in sigle:
     lemma = " "
     gatto = " "
     row.append(no_lemma)
-    for x in range(len(lemmi)):
-        if sigla == lemmi[x][0]:
-            lemma = lemmi[x][1]
+    for x in range(len(lemmiGatTxt)):
+        if sigla == lemmiGatTxt[x][0]:
+            lemma = lemmiGatTxt[x][1]
     row.append(lemma)
     try:
-        gatto = gat[sigla]["File"]
+        gatto = lemmiGatXml[sigla]["Filename"]  # entries are stored under "Filename" (not "File") when lemmiGatXml is built
         row.append(gatto)
     except KeyError:
         pass
@@ -115,7 +114,7 @@ def write_lines(lines, sig, file):
             m = l.strip()
             row.append(m)
         try:
-            gatLems = gat[sig]['lemmi']
+            gatLems = lemmiGatXml[sig]['lemmi']
             thisGatLem = next(filter(lambda el: el['num_lemma']==row[2], gatLems), '')
             row.insert(4, thisGatLem['num_iperlemma'])
         except:
@@ -123,9 +122,9 @@ def write_lines(lines, sig, file):
         csvwriter.writerow(row)
 
 
-for x in range(len(lemmi)):
-    sigla = lemmi[x][0]
-    file_name = lemmi[x][1]
+for x in range(len(lemmiGatTxt)):
+    sigla = lemmiGatTxt[x][0]
+    file_name = lemmiGatTxt[x][1]
     #Cambia percorso
     f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
     lines = f.readlines()
@@ -149,9 +148,9 @@ def write_lines_here(lines, sig):
     return toRet
 
 
-for x in range(len(lemmi)):
-    sigla = lemmi[x][0]
-    file_name = lemmi[x][1]
+for x in range(len(lemmiGatTxt)):
+    sigla = lemmiGatTxt[x][0]
+    file_name = lemmiGatTxt[x][1]
     #Cambia percorso
     f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
     lines = f.readlines()