{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET\n", "import os\n", "import csv\n", "from collections import OrderedDict" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/Data/DallOVI/datiniXML/'" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "gat = []\n", "basepath_gat = baseDir + 'xmlgat'\n", "for entry in os.listdir(basepath_gat):\n", " if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant\n", " gg = entry.split('.')[1]\n", " if gg != '':\n", " gat.append([gg, entry])\n", "\n", "lemmi = []\n", "basepath_lemmi = baseDir + 'lemmi_txt'\n", "for entry in os.listdir(basepath_lemmi):\n", " if os.path.isfile(os.path.join(basepath_lemmi, entry)):\n", " ll = entry.split('.')[1]\n", " if ll != '':\n", " lemmi.append([ll, entry])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Leggo il file BiblioDatini.xml ed estraggo tutte le sigle." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')\n", "root = xmlparse.getroot()\n", "biblio = root.findall(\"Biblio\")\n", "\n", "sigle = []\n", "for bib in biblio:\n", " sigla = bib.find(\"sigla\")\n", " sigle.append(sigla.text.lower())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "OVI_data = open(baseDir + 'Corrispondenza_SIGLA_XML.csv', 'w')\n", "csvwriter = csv.writer(OVI_data)\n", "\n", "params = [\"SIGLA\", \"TXT\", \"XML\"]\n", "\n", "csvwriter.writerow(params)\n", "\n", "for sigla in sigle:\n", " row = [sigla]\n", " no_lemma = \" \"\n", " lemma = \" \"\n", " gatto = \" \"\n", "# for x in range(len(nolemmi)):\n", "# if sigla in nolemmi[x][0]:\n", "# no_lemma = nolemmi[x][1]\n", " row.append(no_lemma)\n", " for x in range(len(lemmi)):\n", " if sigla == lemmi[x][0]:\n", " lemma = lemmi[x][1]\n", " row.append(lemma)\n", " for x in range(len(gat)):\n", " if sigla == gat[x][0]:\n", " gatto = gat[x][1]\n", " row.append(gatto)\n", " csvwriter.writerow(row)\n", "\n", "OVI_data.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "iperlem_data = open(baseDir + 'data_lemmi.csv', 'w')\n", "csvwriter = csv.writer(iperlem_data)\n", "\n", "params = [\"sigla\", \"file_txt\" , \"numero\", \"lemma\", \"categoria grammaticale\", \"livello\"]\n", "csvwriter.writerow(params)\n", "\n", "def write_lines(lines, sig, file):\n", " for line in lines:\n", " row = [sig, file]\n", " lem = re.split('\\|', line)\n", " for l in lem:\n", " m = l.strip()\n", " row.append(m)\n", " #print(row)\n", " csvwriter.writerow(row)\n", "\n", "for x in range(len(lemmi)):\n", " sigla = lemmi[x][0]\n", " file_name = lemmi[x][1]\n", " f = open(baseDir + 'lemmi_txt/' + file_name, \"r\", encoding='latin-1')\n", " lines = f.readlines()\n", " write_lines(lines, sigla, file_name)\n", "\n", "\n", "f.close()\n", "\n", "iperlem_data.close()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "110829\n" ] } ], "source": [ "redundantLemmas = []\n", "\n", "def write_lines_here(lines, sig):\n", " toRet = []\n", " for line in lines:\n", " row = [sig]\n", " lem = re.split('\\|', line)\n", " for l in lem:\n", " m = l.strip()\n", " row.append(m)\n", " toRet.append(row)\n", " return toRet\n", "\n", "\n", "for x in range(len(lemmi)):\n", " sigla = lemmi[x][0]\n", " file_name = lemmi[x][1]\n", " #Cambia percorso\n", " f = open(baseDir + 'lemmi_txt/' + file_name, \"r\", encoding='latin-1')\n", " lines = f.readlines()\n", " redundantLemmas = redundantLemmas + write_lines_here(lines, sigla)\n", "\n", "\n", "f.close()\n", "\n", "print(len(redundantLemmas))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check di cosa viene fuori dalla lettura" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['l95', '1', 'pagare', 'v.', '']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "redundantLemmas[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Esporto il lemmario" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "90150\n", "7591\n" ] } ], "source": [ "preprefinal = list(filter(lambda row: 'IPERLEMMA' not in row[1], redundantLemmas))\n", "prefinal = list(map(lambda row: (row[2], row[3], row[4]), preprefinal))\n", "\n", "print(len(prefinal))\n", "\n", "final = list(set(prefinal))\n", "final.sort()\n", "\n", "print(len(final))\n", "\n", "\n", "# IPERLEMMI\n", "\n", "preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))\n", "prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))\n", "\n", "finalIPER = list(set(prefinalIPER))\n", "finalIPER.sort()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "lem_data_unique = open(baseDir + 'lem_unique.csv', 'w')\n", "csvwriter = csv.writer(lem_data_unique)\n", "\n", "csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n", "\n", "for line in final:\n", " csvwriter.writerow(line)\n", "\n", "lem_data_unique.close()\n", "\n", "\n", "# IPERLEMMI\n", "\n", "iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')\n", "csvwriter = csv.writer(iperlem_data_unique)\n", "\n", "csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])\n", "\n", "for line in finalIPER:\n", " #print(line)\n", " csvwriter.writerow(line)\n", "\n", "iperlem_data_unique.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Categorie grammaticali ed export ordinato" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['agg.', 'agg./s.m.', 'antr.', 'avv.', 'n.g.', 'n.p.', 's.f.', 's.f./s.m.', 's.f.pl.', 's.i.', 's.m.', 's.m./s.f.', 's.m.pl.', 'v.']\n" ] } ], "source": [ "cat_gramm = set(map(lambda entry: entry[1], final))\n", "cat_gramm2 = list(cat_gramm)\n", "cat_gramm2.sort()\n", "\n", "print(cat_gramm2)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "agg.\n", "agg./s.m.\n", "antr.\n", "avv.\n", "n.g.\n", "n.p.\n", "s.f.\n", "s.f./s.m.\n", "s.f.pl.\n", "s.i.\n", "s.m.\n", "s.m./s.f.\n", "s.m.pl.\n", "v.\n" ] } ], "source": [ "byType = OrderedDict()\n", "\n", "for type1 in cat_gramm2:\n", " byType[type1] = list(filter(lambda a: a[1]==type1, final))\n", "\n", "\n", "lem_data_byCat = open(baseDir + 'lem_unique_byCat.csv', 'w')\n", "csvwriter = csv.writer(lem_data_byCat)\n", "\n", "csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n", "\n", "for type1 in cat_gramm2:\n", " print(type1)\n", " for line in byType[type1]:\n", " csvwriter.writerow(line)\n", "\n", "lem_data_byCat.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Microprova di modifica + export di xml (modifico un attribute di un tag).\n", "\n", "Come prima cosa, provo a recuperare la lista dei lemmi di un singolo file, e a rintracciare quel lemma e il suo ID (numero d'ordine) nel lemmario." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'n': '6', 'type': '1'}\n", "lem\n", "Lemma: ['71d', '6', 'novembre', 's.m.', '']\n", "New ID: 5039\n", "\n", "{'n': '4', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '4', 'firenze', 'n.g.', '']\n", "New ID: 2708\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '13', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '13', 'portovenere', 'n.g.', '']\n", "New ID: 5809\n", "\n", "{'n': '3', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '3', 'genova', 'n.g.', '']\n", "New ID: 3046\n", "\n", "{'n': '8', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '8', 'console', 's.m.', '']\n", "New ID: 2017\n", "\n", "{'n': '26', 'type': '7'}\n", "lem\n", "Lemma: ['71d', '26', 'artefice', 's.m.', '']\n", "New ID: 650\n", "\n", "{'n': '27', 'type': '8'}\n", "lem\n", "Lemma: ['71d', '27', 'chiesa', 's.f.', '']\n", "New ID: 1811\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '18', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '18', 'palazzo', 's.m.', '']\n", "New ID: 5167\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '22', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '22', 'domenica', 's.f.', '']\n", "New ID: 2369\n", "\n", "{'n': '6', 'type': '1'}\n", "lem\n", "Lemma: ['71d', '6', 'novembre', 's.m.', '']\n", "New ID: 5039\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '8', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '8', 'console', 's.m.', '']\n", "New ID: 2017\n", "\n", "{'n': '28', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '28', 'arte', 's.f.', '']\n", "New ID: 635\n", "\n", "{'n': '29', 'type': '6'}\n", "lem\n", "Lemma: ['71d', '29', 'conestabile', 's.m.', '']\n", "New ID: 2005\n", "\n", "{'n': '18', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '18', 'palazzo', 's.m.', '']\n", "New ID: 5167\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '22', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '22', 'domenica', 's.f.', '']\n", "New ID: 2369\n", "\n", "{'n': '26', 'type': '7'}\n", "lem\n", "Lemma: ['71d', '26', 'artefice', 's.m.', '']\n", "New ID: 650\n", "\n", "{'n': '17', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '17', 'guelfo', 'agg./s.m.', '']\n", "New ID: 3586\n", "\n", "{'n': '16', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '16', 'ghibellino', 'agg./s.m.', '']\n", "New ID: 3079\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '23', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '23', 'eleggere', 'v.', '']\n", "New ID: 2457\n", "\n", "{'n': '26', 'type': '7'}\n", "lem\n", "Lemma: ['71d', '26', 'artefice', 's.m.', '']\n", "New ID: 650\n", "\n", "{'n': '28', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '28', 'arte', 's.f.', '']\n", "New ID: 635\n", "\n", "{'n': '28', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '28', 'arte', 's.f.', '']\n", "New ID: 635\n", "\n", "{'n': '17', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '17', 'guelfo', 'agg./s.m.', '']\n", "New ID: 3586\n", "\n", "{'n': '16', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '16', 'ghibellino', 'agg./s.m.', '']\n", "New ID: 3079\n", "\n", "{'n': '5', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '5', 'messere', 's.m.', '']\n", "New ID: 4569\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '12', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '12', 'podestà', 's.m.', '']\n", "New ID: 5720\n", "\n", "{'n': '30', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '30', 'corona', 's.f.', '']\n", "New ID: 2096\n", "\n", "{'n': '8', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '8', 'console', 's.m.', '']\n", "New ID: 2017\n", "\n", "{'n': '28', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '28', 'arte', 's.f.', '']\n", "New ID: 635\n", "\n", "{'n': '7', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '7', 'lunedì', 's.m.', '']\n", "New ID: 4212\n", "\n", "{'n': '26', 'type': '7'}\n", "lem\n", "Lemma: ['71d', '26', 'artefice', 's.m.', '']\n", "New ID: 650\n", "\n", "{'n': '28', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '28', 'arte', 's.f.', '']\n", "New ID: 635\n", "\n", "{'n': '8', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '8', 'console', 's.m.', '']\n", "New ID: 2017\n", "\n", "{'n': '23', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '23', 'eleggere', 'v.', '']\n", "New ID: 2457\n", "\n", "{'n': '8', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '8', 'console', 's.m.', '']\n", "New ID: 2017\n", "\n", "{'n': '23', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '23', 'eleggere', 'v.', '']\n", "New ID: 2457\n", "\n", "{'n': '25', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '25', 'collardo di callevilla (governatore di genova)', 'antr.', '']\n", "New ID: 1925\n", "\n", "{'n': '24', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '24', 'consiglio', 's.m.', '']\n", "New ID: 2015\n", "\n", "{'n': '23', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '23', 'eleggere', 'v.', '']\n", "New ID: 2457\n", "\n", "{'n': '1', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '1', 'denaro', 's.m.', '']\n", "New ID: 2301\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '14', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '14', 're', 's.m.', '']\n", "New ID: 6006\n", "\n", "{'n': '31', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '31', 'governatore', 's.m.', '']\n", "New ID: 3472\n", "\n", "{'n': '17', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '17', 'guelfo', 'agg./s.m.', '']\n", "New ID: 3586\n", "\n", "{'n': '16', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '16', 'ghibellino', 'agg./s.m.', '']\n", "New ID: 3079\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '15', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '15', 'riviera ligure', 'n.g.', '']\n", "New ID: 6134\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '3', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '3', 'genova', 'n.g.', '']\n", "New ID: 3046\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '2', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '2', 'città', 's.f.', '']\n", "New ID: 1889\n", "\n", "{'n': '11', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '11', 'marina', 's.f.', '']\n", "New ID: 4384\n", "\n", "{'n': '32', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '32', 'motrone', 'n.g.', '']\n", "New ID: 4718\n", "\n", "{'n': '3', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '3', 'genova', 'n.g.', '']\n", "New ID: 3046\n", "\n", "{'n': '10', 'type': '2'}\n", "lem\n", "Lemma: ['71d', '10', 'galea', 's.f.', '']\n", "New ID: 2982\n", "\n", "{'n': '9', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '9', 'armato', 'agg.', '']\n", "New ID: 592\n", "\n", "{'n': '20', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '20', 'inverno', 's.m.', '']\n", "New ID: 3818\n", "\n", "{'n': '9', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '9', 'armato', 'agg.', '']\n", "New ID: 592\n", "\n", "{'n': '33', 'type': '2'}\n", "lem\n", "Lemma: ['71d', '33', 'marinaio', 's.m.', '']\n", "New ID: 4385\n", "\n", "{'n': '3', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '3', 'genova', 'n.g.', '']\n", "New ID: 3046\n", "\n", "{'n': '6', 'type': '1'}\n", "lem\n", "Lemma: ['71d', '6', 'novembre', 's.m.', '']\n", "New ID: 5039\n", "\n", "{'n': '21', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '21', 'piero di giusto benintendi', 'antr.', '']\n", "New ID: 5596\n", "\n", "{'n': '19', 'type': '0'}\n", "lem\n", "Lemma: ['71d', '19', 'piazza', 's.f.', '']\n", "New ID: 5514\n", "\n", "{'n': '18', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '18', 'palazzo', 's.m.', '']\n", "New ID: 5167\n", "\n", "{'n': '12', 'type': '4'}\n", "lem\n", "Lemma: ['71d', '12', 'podestà', 's.m.', '']\n", "New ID: 5720\n", "\n" ] } ], "source": [ "ii = 2\n", "\n", "smalltree = ET.parse(baseDir + 'xmlgat/' + gat[ii][1])\n", "smallroot = smalltree.getroot()\n", "localLemNodes = smallroot.iter('lem')\n", "\n", "for node in localLemNodes:\n", " print(node.attrib)\n", " print(node.tag)\n", " #\n", " thisLemma = list(filter(lambda lem: lem[0]==gat[ii][0] and lem[1]==node.attrib['n'], redundantLemmas))[0]\n", " newID = final.index((thisLemma[2], thisLemma[3], thisLemma[4]))\n", " #\n", " print('Lemma: ', thisLemma)\n", " print('New ID: ', newID)\n", " print()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 2 }