{ "cells": [
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import xml.etree.ElementTree as ET\n",
 "import os\n",
 "import csv\n",
 "from collections import OrderedDict"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "# Root folder of the OVI data. Set the OVI_DATA_DIR environment variable to\n",
 "# point somewhere else; os.path.join(..., '') guarantees the trailing\n",
 "# separator that the plain string concatenations in the other cells rely on.\n",
 "baseDir = os.path.join(\n",
 "    os.environ.get('OVI_DATA_DIR', '/home/kora/Desktop/OVI_Data_local/Dati nuOVI (fine Giugno 21)/datiniXML/'),\n",
 "    ''\n",
 ")"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "def list_sigla_files(folder):\n",
 "    \"\"\"Return [sigla, file_name] pairs for the regular files in `folder`.\n",
 "\n",
 "    The sigla is the second dot-separated field of the file name. Files whose\n",
 "    name has no second field, or an empty one, are skipped.\n",
 "    \"\"\"\n",
 "    pairs = []\n",
 "    for entry in os.listdir(folder):\n",
 "        if not os.path.isfile(os.path.join(folder, entry)):\n",
 "            continue\n",
 "        fields = entry.split('.')\n",
 "        # A name without any dot has a single field: indexing [1] directly\n",
 "        # would raise IndexError, so such files are skipped instead.\n",
 "        if len(fields) > 1 and fields[1] != '':\n",
 "            pairs.append([fields[1], entry])\n",
 "    return pairs\n",
 "\n",
 "\n",
 "basepath_gat = baseDir + 'xmlgat'\n",
 "gat = list_sigla_files(basepath_gat)\n",
 "\n",
 "# The 'nolemmi' files are not available at the moment: the list stays empty\n",
 "# unless the folder exists, so that cells reading `nolemmi` can still run.\n",
 "basepath_nolemmi = baseDir + 'DatiniXML_incompleto'\n",
 "nolemmi = list_sigla_files(basepath_nolemmi) if os.path.isdir(basepath_nolemmi) else []\n",
 "\n",
 "basepath_lemmi = baseDir + 'lemmi'\n",
 "lemmi = list_sigla_files(basepath_lemmi)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Non mi è chiaro a cosa servano le **tre** celle seguenti -- in ogni caso, non ho a disposizione BiblioDatini.xml nella versione corrente dei dati OVI."
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')\n",
 "root = xmlparse.getroot()\n",
 "biblio = root.findall(\"Biblio\")\n",
 "\n",
 "# Lower-cased sigla of every <Biblio> record. Records without a <sigla>\n",
 "# (or with an empty one) are skipped: they used to raise AttributeError.\n",
 "sigle = []\n",
 "for bib in biblio:\n",
 "    sigla_text = bib.findtext(\"sigla\")\n",
 "    if sigla_text:\n",
 "        sigle.append(sigla_text.lower())"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "# 'nolemmi' may not have been defined by an earlier cell; fall back to an\n",
 "# empty list so this cell does not stop with a NameError.\n",
 "try:\n",
 "    nolemmi\n",
 "except NameError:\n",
 "    nolemmi = []\n",
 "\n",
 "params = [\"BiblioDatini\", \"nolemmi\", \"lemmi\", \"xmlgat\"]\n",
 "\n",
 "# One dictionary per folder instead of one scan per sigla. Later pairs\n",
 "# overwrite earlier ones, so the last file listed for a sigla wins, exactly\n",
 "# as in a scan that keeps the last match.\n",
 "lemmi_by_sigla = {key: name for key, name in lemmi}\n",
 "gat_by_sigla = {key: name for key, name in gat}\n",
 "\n",
 "# newline='' is what the csv module asks for; 'with' closes the file even\n",
 "# when a row fails to be written.\n",
 "with open(baseDir + 'OVI_Data.csv', 'w', newline='') as OVI_data:\n",
 "    csvwriter = csv.writer(OVI_data)\n",
 "    csvwriter.writerow(params)\n",
 "    for sigla in sigle:\n",
 "        # 'nolemmi' keys are matched by substring, so they still need a scan;\n",
 "        # the last match wins.\n",
 "        no_lemma = \" \"\n",
 "        for key, name in nolemmi:\n",
 "            if sigla in key:\n",
 "                no_lemma = name\n",
 "        csvwriter.writerow([sigla, no_lemma, lemmi_by_sigla.get(sigla, \" \"), gat_by_sigla.get(sigla, \" \")])"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "# Change the file paths\n",
 "#xml_file_name = 'Ovi/BiblioDatini.xml'\n",
 "#tree = Xet.parse(xml_file_name)\n",
 "#root = tree.getroot() --> already defined\n",
 "#biblio = root.findall(\"Biblio\") --> already defined\n",
 "\n",
 "\n",
 "def cell(p, arr):\n",
 "    \"\"\"Return the text of the first subelement of `arr` matching `p`.\n",
 "\n",
 "    Returns \" \" when there is no such subelement. The text itself may be\n",
 "    None when the subelement exists but is empty.\n",
 "    \"\"\"\n",
 "    found = arr.find(p)\n",
 "    return \" \" if found is None else found.text\n",
 "\n",
 "\n",
 "# Every tag found in the document, except \"dataroot\" and \"Biblio\", becomes a\n",
 "# column. Sorting gives the same column order on every run (the iteration\n",
 "# order of a set of strings is not stable between runs), and the set\n",
 "# difference does not fail when one of the two tags is absent.\n",
 "elemList = sorted({elem.tag for elem in root.iter()} - {\"dataroot\", \"Biblio\"})\n",
 "param = elemList\n",
 "\n",
 "with open(baseDir + 'Datini_Data.csv', 'w', newline='') as Datini_data:\n",
 "    csvwriter = csv.writer(Datini_data)\n",
 "    csvwriter.writerow(param)\n",
 "    for scheda in biblio:\n",
 "        aut = []\n",
 "        for par in param:\n",
 "            if par == \"star_note\":\n",
 "                # star_note is searched at any depth and only reported as a flag\n",
 "                r = \" \" if scheda.find(\".//star_note\") is None else \"True\"\n",
 "            else:\n",
 "                r = cell(par, scheda)\n",
 "            aut.append(r)\n",
 "        csvwriter.writerow(aut)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Si riprende da qui"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "import re\n",
 "\n",
 "\n",
 "def write_lines(lines, sig, file):\n",
 "    \"\"\"Write one CSV row per line through the global `csvwriter`.\n",
 "\n",
 "    Each row is [sig, file] followed by the '|'-separated fields of the\n",
 "    line, stripped of surrounding whitespace.\n",
 "    \"\"\"\n",
 "    for line in lines:\n",
 "        # raw string: '\\|' in a plain string is an invalid escape sequence\n",
 "        csvwriter.writerow([sig, file] + [field.strip() for field in re.split(r'\\|', line)])\n",
 "\n",
 "\n",
 "params = [\"sigla\", \"file\", \"num\", \"lemma\", \"commento\", \"livello\"]\n",
 "\n",
 "with open(baseDir + 'lem_Data.csv', 'w', newline='') as iperlem_data:\n",
 "    csvwriter = csv.writer(iperlem_data)\n",
 "    csvwriter.writerow(params)\n",
 "    for sigla, file_name in lemmi:\n",
 "        # Change the path\n",
 "        # 'with' closes every lemma file; a single close() after the loop\n",
 "        # only closed the last one and failed when `lemmi` was empty.\n",
 "        with open(baseDir + 'lemmi/' + file_name, \"r\", encoding='latin-1') as f:\n",
 "            write_lines(f.readlines(), sigla, file_name)"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "110829\n"
] } ], "source": [
 "def write_lines_here(lines, sig):\n",
 "    \"\"\"Return one row per line: [sig] followed by the stripped '|'-separated fields.\"\"\"\n",
 "    return [[sig] + [field.strip() for field in re.split(r'\\|', line)] for line in lines]\n",
 "\n",
 "\n",
 "redundantLemmas = []\n",
 "for sigla, file_name in lemmi:\n",
 "    # Change the path\n",
 "    with open(baseDir + 'lemmi/' + file_name, \"r\", encoding='latin-1') as f:\n",
 "        # extend() appends in place instead of copying the whole list for every file\n",
 "        redundantLemmas.extend(write_lines_here(f.readlines(), sigla))\n",
 "\n",
 "print(len(redundantLemmas))"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Check di cosa viene fuori dalla lettura"
] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [
 "['b60', '1', 'denaro', 's.m.', '']"
] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "redundantLemmas[0]"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Esporto il lemmario"
] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "90150\n",
 "7591\n"
] } ], "source": [
 "# Rows whose second field contains 'IPERLEMMA' are exported separately below;\n",
 "# of every row only fields 2, 3 and 4 are kept (written out further down as\n",
 "# lemma, grammatical category and note).\n",
 "preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]\n",
 "prefinal = [(row[2], row[3], row[4]) for row in preprefinal]\n",
 "\n",
 "print(len(prefinal))\n",
 "\n",
 "# Unique triples in sorted order: the position of a triple in this list is\n",
 "# what the XML cells further down use as the new lemma ID.\n",
 "final = sorted(set(prefinal))\n",
 "\n",
 "print(len(final))\n",
 "\n",
 "\n",
 "# IPERLEMMI\n",
 "\n",
 "preprefinalIPER = [row for row in redundantLemmas if 'IPERLEMMA' in row[1]]\n",
 "prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]\n",
 "\n",
 "finalIPER = sorted(set(prefinalIPER))"
] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
 "def write_unique_csv(path, header, rows):\n",
 "    \"\"\"Write `header` followed by every row of `rows` to the CSV file at `path`.\"\"\"\n",
 "    with open(path, 'w', newline='') as out:\n",
 "        writer = csv.writer(out)\n",
 "        writer.writerow(header)\n",
 "        writer.writerows(rows)\n",
 "\n",
 "\n",
 "write_unique_csv(baseDir + 'lem_unique.csv', ['lemma', 'categoria grammaticale', 'note'], final)\n",
 "\n",
 "# IPERLEMMI\n",
 "write_unique_csv(baseDir + 'iperlem_unique.csv', ['iperlemma', 'categoria grammaticale', 'note'], finalIPER)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Categorie grammaticali ed export ordinato"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cat_gramm = {entry[1] for entry in final}\n",
 "cat_gramm2 = sorted(cat_gramm)\n",
 "\n",
 "print(cat_gramm2)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Group the lemmas by grammatical category in a single pass over `final`;\n",
 "# inside each category the entries keep the order they have in `final`.\n",
 "byType = OrderedDict((type1, []) for type1 in cat_gramm2)\n",
 "for entry in final:\n",
 "    byType[entry[1]].append(entry)\n",
 "\n",
 "with open(baseDir + 'lem_unique_byCat.csv', 'w', newline='') as lem_data_byCat:\n",
 "    csvwriter = csv.writer(lem_data_byCat)\n",
 "    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n",
 "    for type1 in cat_gramm2:\n",
 "        print(type1)\n",
 "        csvwriter.writerows(byType[type1])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Microprova di modifica + export di xml (modifico un attribute di un tag).\n",
 "\n",
 "Come prima cosa, provo a recuperare la lista dei lemmi di un singolo file, e a rintracciare quel lemma e il suo ID (numero d'ordine) nel lemmario."
] },
{ "cell_type": "code", "execution_count": 49, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "{'n': '6', 'type': '1'}\n", "lem\n", "Lemma: ['d16', '6', 'maggio', 's.m.', '']\n", "New ID: 4247\n", "\n",
 "{'n': '13', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '13', 'simona (donna di piero di paolo rinaldeschi)', 'antr.', '']\n", "New ID: 6666\n", "\n",
 "{'n': '7', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '7', 'famiglio', 's.m.', '']\n", "New ID: 2514\n", "\n",
 "{'n': '18', 'type': '2'}\n", "lem\n", "Lemma: ['d16', '18', 'oncia', 's.f.', '']\n", "New ID: 5069\n", "\n",
 "{'n': '20', 'type': '4'}\n", "lem\n", "Lemma: ['d16', '20', 'bottoncino', 's.m.', '']\n", "New ID: 1255\n", "\n",
 "{'n': '22', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '22', 'infilare', 'v.', '']\n", "New ID: 3786\n", "\n",
 "{'n': '9', 'type': '2'}\n", "lem\n", "Lemma: ['d16', '9', 'braccio', 's.m.', '']\n", "New ID: 1269\n", "\n",
 "{'n': '19', 'type': '4'}\n", "lem\n", "Lemma: ['d16', '19', 'frangia', 's.f.', '']\n", "New ID: 2907\n", "\n",
 "{'n': '8', 'type': '3'}\n", "lem\n", "Lemma: ['d16', '8', 'nero', 'agg./s.m.', '']\n", "New ID: 4860\n", "\n",
 "{'n': '15', 'type': '3'}\n", "lem\n", "Lemma: ['d16', '15', 'azzurro', 'agg./s.m.', '']\n", "New ID: 709\n", "\n",
 "{'n': '16', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '16', 'nannino (manovale)', 'antr.', '']\n", "New ID: 4807\n", "\n",
 "{'n': '12', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '12', 'san bartolomeo (chiesa e convento di)', 'n.g.', 'a prato (convento di carmelitani)']\n", "New ID: 6303\n", "\n",
 "{'n': '14', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']\n", "New ID: 4789\n", "\n",
 "{'n': '11', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '11', 'garzone', 's.m.', '']\n", "New ID: 3021\n", "\n",
 "{'n': '21', 'type': '1'}\n", "lem\n", "Lemma: ['d16', '21', 'nona', 's.f.', '']\n", "New ID: 5025\n", "\n",
 "{'n': '10', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '10', 'domenica', 's.f.', '']\n", "New ID: 2369\n", "\n",
 "{'n': '3', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '3', 'sere', 's.m.', '']\n", "New ID: 6611\n", "\n",
 "{'n': '17', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '17', 'clemente di niccolò di piero', 'antr.', '']\n", "New ID: 1900\n", "\n",
 "{'n': '14', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']\n", "New ID: 4789\n", "\n",
 "{'n': '1', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '1', 'margherita di domenico bandini', 'antr.', '']\n", "New ID: 4378\n", "\n",
 "{'n': '5', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n", "New ID: 5821\n", "\n",
 "{'n': '4', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '4', 'francesco di marco datini', 'antr.', '']\n", "New ID: 2864\n", "\n",
 "{'n': '5', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n", "New ID: 5821\n", "\n",
 "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '2', 'firenze', 'n.g.', '']\n", "New ID: 2708\n", "\n",
 "{'n': '5', 'type': '0'}\n", "lem\n", "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n", "New ID: 5821\n", "\n",
 "{'n': '6', 'type': '1'}\n", "lem\n", "Lemma: ['d16', '6', 'maggio', 's.m.', '']\n", "New ID: 4247\n", "\n"
] } ], "source": [
 "ii = 2\n",
 "sigla, file_name = gat[ii]\n",
 "\n",
 "smalltree = ET.parse(baseDir + 'xmlgat/' + file_name)\n",
 "smallroot = smalltree.getroot()\n",
 "\n",
 "# Index this file's lemmas by their local number once, instead of filtering\n",
 "# all of redundantLemmas for every node; setdefault keeps the first row when\n",
 "# a number occurs more than once.\n",
 "localLemmas = {}\n",
 "for lem in redundantLemmas:\n",
 "    if lem[0] == sigla:\n",
 "        localLemmas.setdefault(lem[1], lem)\n",
 "\n",
 "localLemNodes = smallroot.iter('lem')\n",
 "\n",
 "for node in localLemNodes:\n",
 "    print(node.attrib)\n",
 "    print(node.tag)\n",
 "    #\n",
 "    thisLemma = localLemmas[node.attrib['n']]\n",
 "    newID = final.index((thisLemma[2], thisLemma[3], thisLemma[4]))\n",
 "    #\n",
 "    print('Lemma: ', thisLemma)\n",
 "    print('New ID: ', newID)\n",
 "    print()"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Qui faccio una prova di modifica del singolo file e di export in nuovo xml"
] },
{ "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [
 "ii = 2\n",
 "\n",
 "smalltree = ET.parse(baseDir + 'xmlgat/' + gat[ii][1])\n",
 "smallroot = smalltree.getroot()\n",
 "\n",
 "# Trial edit: overwrite the 'n' attribute of every <lem> node with a dummy value\n",
 "for node in smallroot.iter('lem'):\n",
 "    node.set('n', '100h')\n",
 "\n",
 "smalltree.write(baseDir + 'prova.xml')"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Infine provo un loop su tutti i file, li elaboro e li esporto in una nuova cartella 'newxmlgat' (creata automaticamente se non esiste)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "Parsing original xml file: xmlgat.k01.txt failed\n",
 "Parsing original xml file: xmlgat.j99.txt failed\n",
 "Parsing original xml file: xmlgat.c13.txt failed\n"
] } ], "source": [
 "# Build both lookups once. Filtering redundantLemmas and calling final.index()\n",
 "# for every <lem> node of every file rescans the full lists each time.\n",
 "lemmaByKey = {}\n",
 "for lem in redundantLemmas:\n",
 "    # setdefault keeps the first row of a (sigla, n) pair, as filter(...)[0] did\n",
 "    lemmaByKey.setdefault((lem[0], lem[1]), lem)\n",
 "# `final` is sorted and duplicate-free, so the position equals final.index()\n",
 "newIDByLemma = {entry: idx for idx, entry in enumerate(final)}\n",
 "\n",
 "# Create the output folder if it is missing, so every write does not fail\n",
 "outDir = baseDir + 'newxmlgat/'\n",
 "os.makedirs(outDir, exist_ok=True)\n",
 "\n",
 "for sigla, file_name in gat:\n",
 "    try:\n",
 "        smalltree = ET.parse(baseDir + 'xmlgat/' + file_name)\n",
 "    except Exception as e:\n",
 "        # Best effort: report the file and the reason, then go on with the next one.\n",
 "        # Exception instead of a bare except, so that Ctrl-C still stops the loop.\n",
 "        print('Parsing original xml file: ', file_name, ' failed' )\n",
 "        print(e)\n",
 "        continue\n",
 "    smallroot = smalltree.getroot()\n",
 "    #\n",
 "    for node in smallroot.iter('lem'):\n",
 "        try:\n",
 "            thisLemma = lemmaByKey[(sigla, node.attrib['n'])]\n",
 "            newID = newIDByLemma[(thisLemma[2], thisLemma[3], thisLemma[4])]\n",
 "        except (KeyError, IndexError) as e:\n",
 "            # Missing 'n' attribute, unknown lemma number, or a lemma that is\n",
 "            # not part of `final`: the node keeps its original number.\n",
 "            print('In looking for lemma:')\n",
 "            print(repr(e))\n",
 "            continue\n",
 "        node.set('n', str(newID))\n",
 "    try:\n",
 "        smalltree.write(outDir + 'newxmlgat.' + sigla + '.xml')\n",
 "    except Exception as e:\n",
 "        print('In Export:')\n",
 "        print(e)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }
],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } },
"nbformat": 4, "nbformat_minor": 2 }