{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OVI data inventory\n",
    "\n",
    "Builds CSV summaries from the OVI data folder:\n",
    "\n",
    "- `OVI_Data.csv`: for each sigla in `BiblioDatini.xml`, the matching file in `DatiniXML_incompleto`, `lemmi` and `xmlgat`;\n",
    "- `Datini_Data.csv`: one row per `Biblio` record, one column per tag;\n",
    "- `lem_Data.csv` and `lem_unique.csv`: the pipe-delimited lines of the `lemmi` files, in full and deduplicated.\n",
    "\n",
    "The last section is a small test of editing and re-exporting one `xmlgat` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "import re\n",
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder of the data. Set the OVI_DATA_DIR environment variable to run the\n",
    "# notebook against another location without editing this cell.\n",
    "baseDir = os.environ.get('OVI_DATA_DIR', '/home/kora/Desktop/OVI_Data_local/200_DATI_OVI/dati/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Index the data files by sigla"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def list_by_sigla(folder):\n",
    "    \"\"\"Return ``[sigla, file_name]`` pairs for the regular files in ``folder``.\n",
    "\n",
    "    The sigla is the second dot-separated component of the file name.\n",
    "    Files without a dot, or with an empty second component, are skipped.\n",
    "    \"\"\"\n",
    "    pairs = []\n",
    "    for entry in os.listdir(folder):\n",
    "        if not os.path.isfile(os.path.join(folder, entry)):\n",
    "            continue\n",
    "        parts = entry.split('.')\n",
    "        # A name without a dot has no second component; skip it instead of indexing past the end.\n",
    "        if len(parts) > 1 and parts[1] != '':\n",
    "            pairs.append([parts[1], entry])\n",
    "    return pairs\n",
    "\n",
    "\n",
    "basepath_gat = os.path.join(baseDir, 'xmlgat')\n",
    "gat = list_by_sigla(basepath_gat)\n",
    "\n",
    "basepath_nolemmi = os.path.join(baseDir, 'DatiniXML_incompleto')\n",
    "nolemmi = list_by_sigla(basepath_nolemmi)\n",
    "\n",
    "basepath_lemmi = os.path.join(baseDir, 'lemmi')\n",
    "lemmi = list_by_sigla(basepath_lemmi)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read the sigle from BiblioDatini.xml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "xmlparse = ET.parse(os.path.join(baseDir, 'BiblioDatini.xml'))\n",
    "root = xmlparse.getroot()\n",
    "biblio = root.findall(\"Biblio\")\n",
    "\n",
    "# Lower-cased sigla of every Biblio record, in document order.\n",
    "# A record with no <sigla> child (or an empty one) raises AttributeError here.\n",
    "sigle = [bib.find(\"sigla\").text.lower() for bib in biblio]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## File inventory per sigla (`OVI_Data.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_match(pairs, matches):\n",
    "    \"\"\"Return the file name of the last pair whose sigla satisfies ``matches``.\n",
    "\n",
    "    ``pairs`` is a list of ``[sigla, file_name]``; ``matches`` is a predicate on the sigla.\n",
    "    Returns ``\" \"`` when no pair matches.\n",
    "    \"\"\"\n",
    "    found = \" \"\n",
    "    for key, file_name in pairs:\n",
    "        if matches(key):\n",
    "            found = file_name\n",
    "    return found\n",
    "\n",
    "\n",
    "params = [\"BiblioDatini\", \"nolemmi\", \"lemmi\", \"xmlgat\"]\n",
    "\n",
    "# newline='' is what the csv module expects for files it writes to.\n",
    "with open(os.path.join(baseDir, 'OVI_Data.csv'), 'w', newline='') as OVI_data:\n",
    "    csvwriter = csv.writer(OVI_data)\n",
    "    csvwriter.writerow(params)\n",
    "\n",
    "    for sigla in sigle:\n",
    "        # NOTE(review): nolemmi is matched by substring (`in`) while lemmi and xmlgat\n",
    "        # use equality; kept as it was -- confirm this difference is intended.\n",
    "        no_lemma = last_match(nolemmi, lambda key: sigla in key)\n",
    "        lemma = last_match(lemmi, lambda key: sigla == key)\n",
    "        gatto = last_match(gat, lambda key: sigla == key)\n",
    "        csvwriter.writerow([sigla, no_lemma, lemma, gatto])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bibliographic records (`Datini_Data.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cell(p, arr):\n",
    "    \"\"\"Return the text of the first subelement ``p`` of ``arr``, or ``\" \"`` when there is none.\"\"\"\n",
    "    node = arr.find(p)\n",
    "    return \" \" if node is None else node.text\n",
    "\n",
    "\n",
    "# One column per tag found anywhere in the document, minus `dataroot` and `Biblio`.\n",
    "# sorted() pins the column order: iterating a set of strings can give a different\n",
    "# order in every kernel session.\n",
    "elemList = sorted({elem.tag for elem in root.iter()} - {\"dataroot\", \"Biblio\"})\n",
    "\n",
    "param = elemList\n",
    "\n",
    "with open(os.path.join(baseDir, 'Datini_Data.csv'), 'w', newline='') as Datini_data:\n",
    "    csvwriter = csv.writer(Datini_data)\n",
    "    csvwriter.writerow(param)\n",
    "\n",
    "    for scheda in biblio:\n",
    "        aut = []\n",
    "        for par in param:\n",
    "            if par == \"star_note\":\n",
    "                # star_note is exported as a presence flag (looked up at any depth), not as text.\n",
    "                r = \" \" if scheda.find(\".//star_note\") is None else \"True\"\n",
    "            else:\n",
    "                r = cell(par, scheda)\n",
    "            aut.append(r)\n",
    "        csvwriter.writerow(aut)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lemma files (`lem_Data.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_fields(line):\n",
    "    \"\"\"Split one pipe-delimited line into its fields, each stripped of surrounding whitespace.\"\"\"\n",
    "    return [field.strip() for field in line.split('|')]\n",
    "\n",
    "\n",
    "def read_lemmi_lines(file_name):\n",
    "    \"\"\"Return the lines of ``file_name`` from the ``lemmi`` folder, decoded as latin-1.\"\"\"\n",
    "    # The with-block closes every file, not only the last one opened by the calling loop.\n",
    "    with open(os.path.join(baseDir, 'lemmi', file_name), \"r\", encoding='latin-1') as f:\n",
    "        return f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_lines(lines, sig, file):\n",
    "    \"\"\"Write one row ``[sig, file, *fields]`` per line through the module-level ``csvwriter``.\"\"\"\n",
    "    for line in lines:\n",
    "        csvwriter.writerow([sig, file] + split_fields(line))\n",
    "\n",
    "\n",
    "params = [\"sigla\", \"file\", \"num\", \"lemma\", \"commento\", \"livello\"]\n",
    "\n",
    "with open(os.path.join(baseDir, 'lem_Data.csv'), 'w', newline='') as iperlem_data:\n",
    "    csvwriter = csv.writer(iperlem_data)\n",
    "    csvwriter.writerow(params)\n",
    "\n",
    "    for sigla, file_name in lemmi:\n",
    "        write_lines(read_lemmi_lines(file_name), sigla, file_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_lines_here(lines, sig):\n",
    "    \"\"\"Return one row ``[sig, *fields]`` per line instead of writing it to a file.\"\"\"\n",
    "    return [[sig] + split_fields(line) for line in lines]\n",
    "\n",
    "\n",
    "rutto = []\n",
    "for sigla, file_name in lemmi:\n",
    "    # extend() appends in place; rebuilding the list with `+` on every file is quadratic.\n",
    "    rutto.extend(write_lines_here(read_lemmi_lines(file_name), sigla))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "99660"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(rutto)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['c09', '1', 'balla', 's.f.', '']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rutto[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Unique lemmas (`lem_unique.csv`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the rows whose first field mentions IPERLEMMA, then keep only the three\n",
    "# fields exported below (row[0] is the sigla, row[1] the first field of the line).\n",
    "preprefinal = [row for row in rutto if 'IPERLEMMA' not in row[1]]\n",
    "prefinal = [(row[2], row[3], row[4]) for row in preprefinal]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "82090"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(prefinal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "final = list(set(prefinal))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6989"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(final)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "final.sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The with-block closes the file, so every row is flushed to disk when the cell finishes.\n",
    "with open(os.path.join(baseDir, 'lem_unique.csv'), 'w', newline='') as lem_data_unique:\n",
    "    csvwriter = csv.writer(lem_data_unique)\n",
    "    csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n",
    "    csvwriter.writerows(final)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## XML edit/export test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'n': '4', 'type': '2'}\n",
      "{'n': '4', 'type': '2'}\n",
      "{'n': '11', 'type': '5'}\n",
      "{'n': '9', 'type': '4'}\n",
      "{'n': '9', 'type': '4'}\n",
      "{'n': '6', 'type': '3'}\n",
      "{'n': '12', 'type': '0'}\n",
      "{'n': '15', 'type': '0'}\n",
      "{'n': '13', 'type': '6'}\n",
      "{'n': '7', 'type': '0'}\n",
      "{'n': '8', 'type': '0'}\n",
      "{'n': '16', 'type': '1'}\n",
      "{'n': '10', 'type': '1'}\n",
      "{'n': '14', 'type': '1'}\n",
      "{'n': '1', 'type': '0'}\n",
      "{'n': '2', 'type': '0'}\n",
      "{'n': '5', 'type': '0'}\n",
      "{'n': '3', 'type': '0'}\n",
      "{'n': '4', 'type': '2'}\n"
     ]
    }
   ],
   "source": [
    "# gat[2] is simply the third file os.listdir returned for xmlgat; that order is\n",
    "# arbitrary, so the file picked here can differ between machines.\n",
    "smalltree = ET.parse(os.path.join(baseDir, 'xmlgat', gat[2][1]))\n",
    "smallroot = smalltree.getroot()\n",
    "\n",
    "for node in smallroot.iter('lem'):\n",
    "    print(node.attrib)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Small test of editing and exporting XML (one attribute of a tag is changed)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'n': '100h', 'type': '2'}\n",
      "{'n': '100h', 'type': '2'}\n",
      "{'n': '100h', 'type': '5'}\n",
      "{'n': '100h', 'type': '4'}\n",
      "{'n': '100h', 'type': '4'}\n",
      "{'n': '100h', 'type': '3'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '6'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '1'}\n",
      "{'n': '100h', 'type': '1'}\n",
      "{'n': '100h', 'type': '1'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '0'}\n",
      "{'n': '100h', 'type': '2'}\n"
     ]
    }
   ],
   "source": [
    "# Overwrites the `n` attribute of every <lem> element of the in-memory tree,\n",
    "# then saves the modified tree to a separate file (the source file is untouched).\n",
    "for node in smallroot.iter('lem'):\n",
    "    node.set('n', '100h')\n",
    "    print(node.attrib)\n",
    "\n",
    "smalltree.write(os.path.join(baseDir, 'prova.xml'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}