Browse Source

add Parser dati OVI to CSV

Alessia 2 years ago
parent
commit
12fa216839
5 changed files with 291 additions and 0 deletions
  1. 63 0
      OVI/BiblioDatinitoCSV.py
  2. 50 0
      OVI/EstraiIperlemmi.py
  3. 50 0
      OVI/EstraiLemmi.py
  4. 56 0
      OVI/Lemmi-Iperlemmi.py
  5. 72 0
      OVI/confrontoDatiOVI.py

+ 63 - 0
OVI/BiblioDatinitoCSV.py

@@ -0,0 +1,63 @@
+import xml.etree.ElementTree as Xet
+from typing import Dict, Any
+import pandas as pd
+import os
+import csv
+from xml.dom import minidom
+import sys
+import re
+
# Input XML. The records exported to CSV are the <Biblio> children of the
# root element.
xml_file_name = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/BiblioDatini.xml'
tree = Xet.parse(xml_file_name)
root = tree.getroot()
biblio = root.findall("Biblio")

# Output CSV. newline='' lets the csv module control line endings itself
# (otherwise blank lines appear between rows on Windows). The handle is left
# open on purpose: the data rows are written, and the file is closed, further
# down in the script.
Datini_data = open('Datini_Data.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(Datini_data)

# The document only needs to be parsed once; the old name is kept as an alias
# of the tree parsed above.
xmlTree = tree

# Column names: every distinct tag found in the document, in first-seen
# (document) order so the header is identical from run to run, minus the two
# structural wrapper tags. dict.fromkeys() de-duplicates while keeping order,
# which a set() does not.
elemList = [
    tag
    for tag in dict.fromkeys(elem.tag for elem in root.iter())
    if tag not in ("dataroot", "Biblio")
]

param = elemList

csvwriter.writerow(param)
+
+#param = ["recno", "titolo", "descrizione", "segnatura", "sigla"]
+
+
def cell(p, arr):
    """Return the text of the first sub-element of *arr* matching *p*.

    Args:
        p: Tag name or ElementTree path passed to ``Element.find``.
        arr: The XML element to search in.

    Returns:
        The element's text; ``""`` when the element exists but has no text;
        ``" "`` (a single space) when no matching element exists.
    """
    node = arr.find(p)  # look the element up once instead of twice
    if node is None:
        return " "
    # An empty element has text None; return "" so the result is always a
    # string (csv.writer renders both None and "" as an empty field).
    return node.text if node.text is not None else ""
+
+
# Write one CSV row per <Biblio> record, with one value per header column.
# try/finally guarantees the output file is closed (and flushed) even if a
# record raises part-way through.
try:
    for scheda in biblio:
        aut = []
        for par in param:
            if par == "star_note":
                # star_note is exported as a presence flag, searched at any
                # depth below the record, rather than as its text.
                r = " " if scheda.find(".//star_note") is None else "True"
            else:
                r = cell(par, scheda)
            aut.append(r)
        csvwriter.writerow(aut)
finally:
    Datini_data.close()

+ 50 - 0
OVI/EstraiIperlemmi.py

@@ -0,0 +1,50 @@
+import re
+import csv
+import os
+
# Index the input directory as [sigla, file name] pairs. The sigla is the
# second dot-separated component of the file name, cut at the first space.
lemmi = []
basepath_lemmi = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/lemmi'
# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# order of the output rows reproducible.
for entry in sorted(os.listdir(basepath_lemmi)):
    if not os.path.isfile(os.path.join(basepath_lemmi, entry)):
        continue
    name_parts = entry.split('.')
    if len(name_parts) < 2:
        # A name without any dot has no sigla component; skip it instead of
        # failing with IndexError.
        continue
    ll = name_parts[1].lstrip().split(' ')[0]
    if ll != '':
        lemmi.append([ll, entry])


# Output CSV. newline='' lets the csv module control line endings itself. The
# handle stays open because the rows are written, and the file is closed,
# further down in the script.
iperlem_data = open('iperlem_Data.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(iperlem_data)

params = ["sigla", "file", "num", "iperlemma", "commento", "livello"]
csvwriter.writerow(params)
+
+
def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per pipe-delimited line.

    Each line is split on ``|``; in every field the marker text
    ``IPERLEMMA`` is replaced by a space and surrounding whitespace
    (including the trailing newline) is stripped. The row written is
    ``[sig, file, field1, field2, ...]``.

    Args:
        lines: Iterable of raw text lines.
        sig: Value for the first column of every row.
        file: Value for the second column of every row.
        writer: Object with a ``writerow`` method. Defaults to the
            module-level ``csvwriter``.
    """
    if writer is None:
        writer = csvwriter
    for line in lines:
        # A plain str.split replaces re.split('\|', ...): the pattern was a
        # non-raw string containing an invalid escape sequence.
        fields = [
            field.replace("IPERLEMMA", " ").strip()
            for field in line.split('|')
        ]
        writer.writerow([sig, file, *fields])
+
+
# For every indexed file, keep only the lines containing the IPERLEMMA marker
# and append them to the CSV.
# try/finally closes the output even if one input file fails; each input file
# is closed as soon as it has been read (previously only the last one was
# closed, and closing failed with NameError when no file had been found).
try:
    for sigla, file_name in lemmi:
        input_path = os.path.join(basepath_lemmi, file_name)
        with open(input_path, "r", encoding='latin-1') as f:
            clean_lines = [line for line in f if "IPERLEMMA" in line]
        write_lines(clean_lines, sigla, file_name)
finally:
    iperlem_data.close()

+ 50 - 0
OVI/EstraiLemmi.py

@@ -0,0 +1,50 @@
+import re
+import csv
+import os
+
# Index the input directory as [sigla, file name] pairs. The sigla is the
# second dot-separated component of the file name, cut at the first space.
lemmi = []
basepath_lemmi = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/lemmi'
# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# order of the output rows reproducible.
for entry in sorted(os.listdir(basepath_lemmi)):
    if not os.path.isfile(os.path.join(basepath_lemmi, entry)):
        continue
    name_parts = entry.split('.')
    if len(name_parts) < 2:
        # A name without any dot has no sigla component; skip it instead of
        # failing with IndexError.
        continue
    ll = name_parts[1].lstrip().split(' ')[0]
    if ll != '':
        lemmi.append([ll, entry])


# NOTE(review): this script is named EstraiLemmi but writes 'iperlem_Data.csv'
# with an "iperlemma" column, exactly like EstraiIperlemmi.py - possibly an
# unfinished copy. Confirm the intended output file and header before relying
# on it; both are left unchanged here.
# newline='' lets the csv module control line endings itself. The handle stays
# open because the rows are written, and the file is closed, further down.
iperlem_data = open('iperlem_Data.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(iperlem_data)

params = ["sigla", "file", "num", "iperlemma", "commento", "livello"]
csvwriter.writerow(params)
+
+
def write_lines(lines, sig, file, writer=None):
    """Write one CSV row per pipe-delimited line.

    Each line is split on ``|``; in every field the marker text
    ``IPERLEMMA`` is replaced by a space and surrounding whitespace
    (including the trailing newline) is stripped. The row written is
    ``[sig, file, field1, field2, ...]``.

    Args:
        lines: Iterable of raw text lines.
        sig: Value for the first column of every row.
        file: Value for the second column of every row.
        writer: Object with a ``writerow`` method. Defaults to the
            module-level ``csvwriter``.
    """
    if writer is None:
        writer = csvwriter
    for line in lines:
        # A plain str.split replaces re.split('\|', ...): the pattern was a
        # non-raw string containing an invalid escape sequence.
        fields = [
            field.replace("IPERLEMMA", " ").strip()
            for field in line.split('|')
        ]
        writer.writerow([sig, file, *fields])
+
+
# For every indexed file, keep only the lines containing the IPERLEMMA marker
# and append them to the CSV.
# NOTE(review): given the script name, the filter was presumably meant to keep
# the lines WITHOUT the marker (the plain lemmas) - confirm; the original
# filter is preserved.
# try/finally closes the output even if one input file fails; each input file
# is closed as soon as it has been read (previously only the last one was
# closed, and closing failed with NameError when no file had been found).
try:
    for sigla, file_name in lemmi:
        input_path = os.path.join(basepath_lemmi, file_name)
        with open(input_path, "r", encoding='latin-1') as f:
            clean_lines = [line for line in f if "IPERLEMMA" in line]
        write_lines(clean_lines, sigla, file_name)
finally:
    iperlem_data.close()

+ 56 - 0
OVI/Lemmi-Iperlemmi.py

@@ -0,0 +1,56 @@
+import csv
+import codecs
+import pandas as pd
+import re
+import os
+import io
+import tokenize
+
# Association table: each row is read as (sigla, lemma number, iperlemma
# number, ...). newline='' is what the csv module expects for files it parses.
with open('/Users/alessiaspadi/Documents/RESTORE/temp_ovi/assoc_Data.csv', newline='') as File:
    results = list(csv.reader(File))

# [sigla, num, lemma] and [sigla, num, iperlemma] triples loaded from the two
# previously exported tables.
# NOTE(review): neither list is read again in this script - confirm whether
# they are still needed.
df = pd.read_csv('/Users/alessiaspadi/Documents/RESTORE/temp_ovi/lem_Data.csv', sep=';')
lemmi = [list(triple) for triple in zip(df.sigla, df.num, df.lemma)]

cf = pd.read_csv('/Users/alessiaspadi/Documents/RESTORE/temp_ovi/iperlemmi_Datini.csv', sep=',')
iperlemmi = [list(triple) for triple in zip(cf.sigla, cf.num, cf.iperlemma)]

# Output CSV. newline='' lets the csv module control line endings itself. The
# handle stays open because the rows are written, and the file is closed,
# further down in the script.
lip_data = open('lip_Data.csv', 'w', newline='', encoding='utf-8')
csvwriter = csv.writer(lip_data)

params = ["sigla", "n_lemma", "n_iperlemma", "lemma", "iperlemma"]
csvwriter.writerow(params)
+
+
# For every (sigla, lemma number, iperlemma number) association, look up the
# matching texts in lemmi.<sigla>.txt and write them after the three keys.
lemmi_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/lemmi/'

# Lines of each lemma file, read once and reused for every association that
# refers to the same sigla.
lines_by_file = {}

# try/finally closes the output even if a row fails; each input file is closed
# right after reading (previously one handle was leaked per row, and the final
# close raised NameError when there were no rows).
try:
    for r in results:
        if not r:
            # csv.reader yields [] for a blank line; nothing to look up.
            continue
        sigla = r[0]
        num = int(r[1])
        ip = int(r[2])
        row = [sigla, num, ip]
        file_name = "lemmi." + sigla + ".txt"
        if file_name not in lines_by_file:
            with open(lemmi_dir + file_name, "r", encoding='latin-1') as f:
                lines_by_file[file_name] = f.readlines()
        # NOTE(review): matches are appended in file order, so the "lemma" and
        # "iperlemma" columns line up with the header only when the lemma line
        # precedes the iperlemma line and both exist - confirm against the
        # data files.
        for line in lines_by_file[file_name]:
            fields = line.split('|')
            if "IPERLEMMA" not in line:
                # Plain lemma line: "<number>|<lemma>|..."
                if int(fields[0]) == num:
                    row.append(fields[1])
            elif int(fields[0].replace("IPERLEMMA", " ")) == ip:
                # Iperlemma line: "IPERLEMMA <number>|<iperlemma>|..."
                row.append(fields[1])
        csvwriter.writerow(row)
finally:
    lip_data.close()

+ 72 - 0
OVI/confrontoDatiOVI.py

@@ -0,0 +1,72 @@
+import os
+import xml.etree.ElementTree as Xet
+import re
+from xml.etree import ElementTree
+import csv
+
def _scan_sigle(basepath):
    """Return [sigla, file name] pairs for the regular files in *basepath*.

    The sigla is the second dot-separated component of the file name, cut at
    the first space. Names without a dot, or with an empty sigla, are skipped.
    Entries are visited in sorted order so the result is reproducible.
    """
    found = []
    for entry in sorted(os.listdir(basepath)):
        if not os.path.isfile(os.path.join(basepath, entry)):
            continue
        name_parts = entry.split('.')
        if len(name_parts) < 2:
            # No dot in the name: no sigla component (avoids IndexError).
            continue
        sig = name_parts[1].lstrip().split(' ')[0]
        if sig != '':
            found.append([sig, entry])
    return found


# The three directories whose contents are compared against the sigle found
# in BiblioDatini.xml.
basepath_nolemmi = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/DatiniXML_incompleto'
nolemmi = _scan_sigle(basepath_nolemmi)

basepath_lemmi = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/lemmi'
lemmi = _scan_sigle(basepath_lemmi)

basepath_gat = '/Users/alessiaspadi/Documents/RESTORE/temp_ovi/xmlgat'
gat = _scan_sigle(basepath_gat)
+
# Collect the lower-cased sigla of every <Biblio> record in the XML file.
xmlparse = Xet.parse('/Users/alessiaspadi/Documents/RESTORE/temp_ovi/BiblioDatini.xml')
root = xmlparse.getroot()
biblio = root.findall("Biblio")

sigle = []
for bib in biblio:
    # findtext() returns None when <sigla> is missing and "" when it is
    # empty; such records are skipped instead of raising AttributeError.
    sigla_text = bib.findtext("sigla")
    if sigla_text:
        sigle.append(sigla_text.lower())


# Dump the four collections for a quick visual check.
print(nolemmi)
print(lemmi)
print(gat)
print(sigle)
+
# Write one row per sigla with the matching file name from each of the three
# directories (" " when there is none).
# The with-block closes the file even if a row fails; newline='' lets the csv
# module control line endings itself.
with open('OVI_Data.csv', 'w', newline='', encoding='utf-8') as OVI_data:
    csvwriter = csv.writer(OVI_data)

    params = ["BiblioDatini", "nolemmi", "lemmi", "xmlgat"]
    csvwriter.writerow(params)

    for sigla in sigle:
        no_lemma = " "
        lemma = " "
        gatto = " "
        # In each scan the last matching file wins.
        for file_sigla, file_name in nolemmi:
            # NOTE(review): this is a substring test, while the two lookups
            # below require equality - confirm the difference is intended.
            if sigla in file_sigla:
                no_lemma = file_name
        for file_sigla, file_name in lemmi:
            if sigla == file_sigla:
                lemma = file_name
        for file_sigla, file_name in gat:
            if sigla == file_sigla:
                gatto = file_name
        csvwriter.writerow([sigla, no_lemma, lemma, gatto])