|
@@ -0,0 +1,683 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import xml.etree.ElementTree as ET\n",
|
|
|
+ "import os\n",
|
|
|
+ "import csv\n",
|
|
|
+ "from collections import OrderedDict"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "baseDir = '/home/kora/Desktop/OVI_Data_local/Dati nuOVI (fine Giugno 21)/datiniXML/'"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "def list_tagged_files(basepath):\n",
+ "    \"\"\"Return [tag, filename] pairs for the regular files in basepath.\n",
+ "\n",
+ "    The tag is the second dot-separated part of the file name, e.g.\n",
+ "    'k01' for 'xmlgat.k01.txt'. Names without a dot, or with an empty\n",
+ "    tag, are skipped.\n",
+ "    \"\"\"\n",
+ "    tagged = []\n",
+ "    for entry in os.listdir(basepath):\n",
+ "        if not os.path.isfile(os.path.join(basepath, entry)):\n",
+ "            continue\n",
+ "        parts = entry.split('.')\n",
+ "        # A name without any dot has no tag; indexing [1] would raise.\n",
+ "        if len(parts) > 1 and parts[1] != '':\n",
+ "            tagged.append([parts[1], entry])\n",
+ "    return tagged\n",
+ "\n",
+ "\n",
+ "basepath_gat = baseDir + 'xmlgat'\n",
+ "gat = list_tagged_files(basepath_gat)\n",
+ "\n",
+ "# The 'nolemmi' files are not available at the moment.\n",
+ "# nolemmi = list_tagged_files(baseDir + 'DatiniXML_incompleto')\n",
+ "\n",
+ "basepath_lemmi = baseDir + 'lemmi'\n",
+ "lemmi = list_tagged_files(basepath_lemmi)\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Non mi è chiaro a cosa servano le **tre** celle seguenti -- in ogni caso, non ho a disposizione BiblioDatini.xml nella versione corrente dei dati OVI."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Parse the bibliography and collect the lower-cased <sigla> text of\n",
+ "# every <Biblio> element found directly under the root.\n",
+ "xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')\n",
+ "root = xmlparse.getroot()\n",
+ "biblio = root.findall('Biblio')\n",
+ "\n",
+ "sigle = [bib.find('sigla').text.lower() for bib in biblio]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 5,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Cross-reference table: one row per bibliography sigla, followed by the\n",
+ "# matching file names from the 'nolemmi', 'lemmi' and 'xmlgat' listings\n",
+ "# (a single space when there is no match).\n",
+ "params = ['BiblioDatini', 'nolemmi', 'lemmi', 'xmlgat']\n",
+ "\n",
+ "# 'nolemmi' may be undefined when its listing is disabled; treat it as empty\n",
+ "# instead of failing with NameError.\n",
+ "nolemmi_pairs = globals().get('nolemmi', [])\n",
+ "# Later entries overwrite earlier ones, i.e. the last matching file wins.\n",
+ "lemmi_by_sigla = {tag: name for tag, name in lemmi}\n",
+ "gat_by_sigla = {tag: name for tag, name in gat}\n",
+ "\n",
+ "# newline='' is what the csv module expects for files it writes to.\n",
+ "with open(baseDir + 'OVI_Data.csv', 'w', newline='', encoding='utf-8') as OVI_data:\n",
+ "    csvwriter = csv.writer(OVI_data)\n",
+ "    csvwriter.writerow(params)\n",
+ "    for sigla in sigle:\n",
+ "        no_lemma = ' '\n",
+ "        # Substring match on the tag; the last matching file wins.\n",
+ "        for tag, name in nolemmi_pairs:\n",
+ "            if sigla in tag:\n",
+ "                no_lemma = name\n",
+ "        csvwriter.writerow([\n",
+ "            sigla,\n",
+ "            no_lemma,\n",
+ "            lemmi_by_sigla.get(sigla, ' '),\n",
+ "            gat_by_sigla.get(sigla, ' '),\n",
+ "        ])"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Export every <Biblio> record of the already-parsed bibliography\n",
+ "# ('root' / 'biblio') to Datini_Data.csv, one column per distinct tag.\n",
+ "\n",
+ "\n",
+ "def cell(p, arr):\n",
+ "    \"\"\"Return the text of the first child of arr matching p, or ' ' if none.\"\"\"\n",
+ "    found = arr.find(p)\n",
+ "    return ' ' if found is None else found.text\n",
+ "\n",
+ "\n",
+ "# Distinct tags in first-seen document order, so the column order is the\n",
+ "# same on every run; the two container tags are not columns.\n",
+ "param = [\n",
+ "    tag\n",
+ "    for tag in dict.fromkeys(elem.tag for elem in root.iter())\n",
+ "    if tag not in ('dataroot', 'Biblio')\n",
+ "]\n",
+ "elemList = param\n",
+ "\n",
+ "# newline='' is what the csv module expects for files it writes to.\n",
+ "with open(baseDir + 'Datini_Data.csv', 'w', newline='', encoding='utf-8') as Datini_data:\n",
+ "    csvwriter = csv.writer(Datini_data)\n",
+ "    csvwriter.writerow(param)\n",
+ "    for scheda in biblio:\n",
+ "        aut = []\n",
+ "        for par in param:\n",
+ "            if par == 'star_note':\n",
+ "                # Presence flag only: any descendant <star_note> counts.\n",
+ "                r = ' ' if scheda.find('.//star_note') is None else 'True'\n",
+ "            else:\n",
+ "                r = cell(par, scheda)\n",
+ "            aut.append(r)\n",
+ "        csvwriter.writerow(aut)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Si riprende da qui"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 6,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import re\n",
+ "\n",
+ "\n",
+ "def write_lines(lines, sig, file):\n",
+ "    \"\"\"Write one CSV row per line: sig, file, then the stripped '|'-separated fields.\n",
+ "\n",
+ "    Rows go to the module-level ``csvwriter``.\n",
+ "    \"\"\"\n",
+ "    for line in lines:\n",
+ "        # A plain str.split on the literal '|' needs no regex escaping.\n",
+ "        fields = [field.strip() for field in line.split('|')]\n",
+ "        csvwriter.writerow([sig, file] + fields)\n",
+ "\n",
+ "\n",
+ "params = ['sigla', 'file', 'num', 'lemma', 'commento', 'livello']\n",
+ "\n",
+ "# newline='' is what the csv module expects for files it writes to.\n",
+ "with open(baseDir + 'lem_Data.csv', 'w', newline='', encoding='utf-8') as iperlem_data:\n",
+ "    csvwriter = csv.writer(iperlem_data)\n",
+ "    csvwriter.writerow(params)\n",
+ "    for sigla, file_name in lemmi:\n",
+ "        # Each lemma file is closed as soon as it has been read.\n",
+ "        with open(baseDir + 'lemmi/' + file_name, 'r', encoding='latin-1') as f:\n",
+ "            lines = f.readlines()\n",
+ "        write_lines(lines, sigla, file_name)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 7,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "110829\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "def write_lines_here(lines, sig):\n",
+ "    \"\"\"Return one row per line: [sig, field, ...] with '|'-separated fields stripped.\"\"\"\n",
+ "    return [\n",
+ "        [sig] + [field.strip() for field in line.split('|')]\n",
+ "        for line in lines\n",
+ "    ]\n",
+ "\n",
+ "\n",
+ "redundantLemmas = []\n",
+ "for sigla, file_name in lemmi:\n",
+ "    # Each lemma file is closed as soon as it has been read.\n",
+ "    with open(baseDir + 'lemmi/' + file_name, 'r', encoding='latin-1') as f:\n",
+ "        lines = f.readlines()\n",
+ "    # extend() grows the list in place instead of copying it on every file.\n",
+ "    redundantLemmas.extend(write_lines_here(lines, sigla))\n",
+ "\n",
+ "print(len(redundantLemmas))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Check di cosa viene fuori dalla lettura"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 15,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/plain": [
|
|
|
+ "['b60', '1', 'denaro', 's.m.', '']"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 15,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "redundantLemmas[0]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Esporto il lemmario"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 33,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "90150\n",
|
|
|
+ "7591\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Keep the rows whose first field does not contain 'IPERLEMMA' and reduce\n",
+ "# each one to the triple later exported as (lemma, category, note).\n",
+ "preprefinal = [row for row in redundantLemmas if 'IPERLEMMA' not in row[1]]\n",
+ "prefinal = [(row[2], row[3], row[4]) for row in preprefinal]\n",
+ "\n",
+ "print(len(prefinal))\n",
+ "\n",
+ "# Unique triples in sorted order.\n",
+ "final = sorted(set(prefinal))\n",
+ "\n",
+ "print(len(final))\n",
+ "\n",
+ "\n",
+ "# IPERLEMMI: same reduction for the rows that do contain the marker.\n",
+ "\n",
+ "preprefinalIPER = [row for row in redundantLemmas if 'IPERLEMMA' in row[1]]\n",
+ "prefinalIPER = [(row[2], row[3], row[4]) for row in preprefinalIPER]\n",
+ "\n",
+ "finalIPER = sorted(set(prefinalIPER))"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 34,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Export the unique lemma and iperlemma triples, one CSV file each.\n",
+ "# newline='' is what the csv module expects for files it writes to.\n",
+ "with open(baseDir + 'lem_unique.csv', 'w', newline='', encoding='utf-8') as lem_data_unique:\n",
+ "    csvwriter = csv.writer(lem_data_unique)\n",
+ "    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n",
+ "    csvwriter.writerows(final)\n",
+ "\n",
+ "\n",
+ "# IPERLEMMI\n",
+ "\n",
+ "with open(baseDir + 'iperlem_unique.csv', 'w', newline='', encoding='utf-8') as iperlem_data_unique:\n",
+ "    csvwriter = csv.writer(iperlem_data_unique)\n",
+ "    csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])\n",
+ "    csvwriter.writerows(finalIPER)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Categorie grammaticali ed export ordinato"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Distinct values of the second field of the unique lemma triples,\n",
+ "# first as a set and then as a sorted list.\n",
+ "cat_gramm = {entry[1] for entry in final}\n",
+ "cat_gramm2 = sorted(cat_gramm)\n",
+ "\n",
+ "print(cat_gramm2)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Group the unique lemmas by their second field; keys follow cat_gramm2 order.\n",
+ "byType = OrderedDict((type1, []) for type1 in cat_gramm2)\n",
+ "\n",
+ "# Single pass over 'final' instead of one full scan per category. Entries\n",
+ "# whose category is not in cat_gramm2 are dropped, as a per-category filter\n",
+ "# would do.\n",
+ "for entry in final:\n",
+ "    if entry[1] in byType:\n",
+ "        byType[entry[1]].append(entry)\n",
+ "\n",
+ "\n",
+ "# newline='' is what the csv module expects for files it writes to.\n",
+ "with open(baseDir + 'lem_unique_byCat.csv', 'w', newline='', encoding='utf-8') as lem_data_byCat:\n",
+ "    csvwriter = csv.writer(lem_data_byCat)\n",
+ "    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n",
+ "    for type1, entries in byType.items():\n",
+ "        print(type1)\n",
+ "        csvwriter.writerows(entries)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Microprova di modifica + export di xml (modifico un attributo di un tag).\n",
|
|
|
+ "\n",
|
|
|
+ "Come prima cosa, provo a recuperare la lista dei lemmi di un singolo file, e a rintracciare quel lemma e il suo ID (numero d'ordine) nel lemmario."
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 49,
|
|
|
+ "metadata": {
|
|
|
+ "scrolled": true
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "{'n': '6', 'type': '1'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '6', 'maggio', 's.m.', '']\n",
|
|
|
+ "New ID: 4247\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '13', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '13', 'simona (donna di piero di paolo rinaldeschi)', 'antr.', '']\n",
|
|
|
+ "New ID: 6666\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '7', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '7', 'famiglio', 's.m.', '']\n",
|
|
|
+ "New ID: 2514\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '18', 'type': '2'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '18', 'oncia', 's.f.', '']\n",
|
|
|
+ "New ID: 5069\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '20', 'type': '4'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '20', 'bottoncino', 's.m.', '']\n",
|
|
|
+ "New ID: 1255\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '22', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '22', 'infilare', 'v.', '']\n",
|
|
|
+ "New ID: 3786\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '9', 'type': '2'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '9', 'braccio', 's.m.', '']\n",
|
|
|
+ "New ID: 1269\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '19', 'type': '4'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '19', 'frangia', 's.f.', '']\n",
|
|
|
+ "New ID: 2907\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '8', 'type': '3'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '8', 'nero', 'agg./s.m.', '']\n",
|
|
|
+ "New ID: 4860\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '15', 'type': '3'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '15', 'azzurro', 'agg./s.m.', '']\n",
|
|
|
+ "New ID: 709\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '16', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '16', 'nannino (manovale)', 'antr.', '']\n",
|
|
|
+ "New ID: 4807\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '12', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '12', 'san bartolomeo (chiesa e convento di)', 'n.g.', 'a prato (convento di carmelitani)']\n",
|
|
|
+ "New ID: 6303\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '14', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']\n",
|
|
|
+ "New ID: 4789\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '11', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '11', 'garzone', 's.m.', '']\n",
|
|
|
+ "New ID: 3021\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '21', 'type': '1'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '21', 'nona', 's.f.', '']\n",
|
|
|
+ "New ID: 5025\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '10', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '10', 'domenica', 's.f.', '']\n",
|
|
|
+ "New ID: 2369\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '3', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '3', 'sere', 's.m.', '']\n",
|
|
|
+ "New ID: 6611\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '17', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '17', 'clemente di niccolò di piero', 'antr.', '']\n",
|
|
|
+ "New ID: 1900\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '14', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '14', 'nanni di luca da santa chiara', 'antr.', '']\n",
|
|
|
+ "New ID: 4789\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '1', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '1', 'margherita di domenico bandini', 'antr.', '']\n",
|
|
|
+ "New ID: 4378\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '5', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n",
|
|
|
+ "New ID: 5821\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '4', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '4', 'francesco di marco datini', 'antr.', '']\n",
|
|
|
+ "New ID: 2864\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '5', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n",
|
|
|
+ "New ID: 5821\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '2', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '2', 'firenze', 'n.g.', '']\n",
|
|
|
+ "New ID: 2708\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '5', 'type': '0'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '5', 'prato', 'n.g.', '']\n",
|
|
|
+ "New ID: 5821\n",
|
|
|
+ "\n",
|
|
|
+ "{'n': '6', 'type': '1'}\n",
|
|
|
+ "lem\n",
|
|
|
+ "Lemma: ['d16', '6', 'maggio', 's.m.', '']\n",
|
|
|
+ "New ID: 4247\n",
|
|
|
+ "\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# For one xmlgat file, print each <lem> node together with the lemma row it\n",
+ "# refers to and that lemma's position in the sorted unique list 'final'.\n",
+ "ii = 2\n",
+ "\n",
+ "gat_path = baseDir + 'xmlgat/' + gat[ii][1]\n",
+ "smalltree = ET.parse(gat_path)\n",
+ "smallroot = smalltree.getroot()\n",
+ "\n",
+ "localLemNodes = smallroot.iter('lem')\n",
+ "\n",
+ "for node in localLemNodes:\n",
+ "    print(node.attrib)\n",
+ "    print(node.tag)\n",
+ "    # First row with the same sigla as the file and the same number as the node.\n",
+ "    candidates = [\n",
+ "        lem for lem in redundantLemmas\n",
+ "        if lem[0] == gat[ii][0] and lem[1] == node.attrib['n']\n",
+ "    ]\n",
+ "    thisLemma = candidates[0]\n",
+ "    lemma_key = (thisLemma[2], thisLemma[3], thisLemma[4])\n",
+ "    newID = final.index(lemma_key)\n",
+ "    print('Lemma: ', thisLemma)\n",
+ "    print('New ID: ', newID)\n",
+ "    print()\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Qui faccio una prova di modifica del singolo file e di export in nuovo xml"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 50,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "# Trial run: overwrite the 'n' attribute of every <lem> node of one file\n",
+ "# with a dummy value and save the tree as prova.xml.\n",
+ "ii = 2\n",
+ "\n",
+ "source_path = baseDir + 'xmlgat/' + gat[ii][1]\n",
+ "smalltree = ET.parse(source_path)\n",
+ "smallroot = smalltree.getroot()\n",
+ "\n",
+ "for node in smallroot.iter('lem'):\n",
+ "    node.set('n', '100h')\n",
+ "\n",
+ "target_path = baseDir + 'prova.xml'\n",
+ "smalltree.write(target_path)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "Infine provo un loop su tutti i file, li elaboro e li esporto in una nuova cartella (che va creata a mano) 'newxmlgat'"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Parsing original xml file: xmlgat.k01.txt failed\n",
|
|
|
+ "Parsing original xml file: xmlgat.j99.txt failed\n",
|
|
|
+ "Parsing original xml file: xmlgat.c13.txt failed\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Rewrite the 'n' attribute of every <lem> node in each xmlgat file to the\n",
+ "# lemma's position in 'final', and save the result under 'newxmlgat/'.\n",
+ "out_dir = baseDir + 'newxmlgat/'\n",
+ "os.makedirs(out_dir, exist_ok=True)\n",
+ "\n",
+ "# Lookup tables built once, instead of scanning both lists for every node.\n",
+ "first_row_by_key = {}\n",
+ "for row in redundantLemmas:\n",
+ "    # setdefault keeps the first row seen for each (sigla, n) pair.\n",
+ "    first_row_by_key.setdefault((row[0], row[1]), row)\n",
+ "id_by_triple = {triple: idx for idx, triple in enumerate(final)}\n",
+ "\n",
+ "for sigla, file_name in gat:\n",
+ "    try:\n",
+ "        smalltree = ET.parse(baseDir + 'xmlgat/' + file_name)\n",
+ "    except (ET.ParseError, OSError, ValueError):\n",
+ "        # Only a failure to read or parse the file is reported as such.\n",
+ "        print('Parsing original xml file: ', file_name, ' failed')\n",
+ "        continue\n",
+ "    smallroot = smalltree.getroot()\n",
+ "    for node in smallroot.iter('lem'):\n",
+ "        try:\n",
+ "            thisLemma = first_row_by_key[(sigla, node.attrib['n'])]\n",
+ "            newID = id_by_triple[(thisLemma[2], thisLemma[3], thisLemma[4])]\n",
+ "        except (KeyError, IndexError) as e:\n",
+ "            # Unknown lemma: report it and leave the node unchanged.\n",
+ "            print('In looking for lemma:')\n",
+ "            print(e)\n",
+ "        else:\n",
+ "            node.set('n', str(newID))\n",
+ "    try:\n",
+ "        smalltree.write(out_dir + 'newxmlgat.' + sigla + '.xml')\n",
+ "    except Exception as e:\n",
+ "        # Best effort: keep going with the remaining files.\n",
+ "        print('In Export:')\n",
+ "        print(e)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.8.5"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 2
|
|
|
+}
|