3 years ago · b9ec60701b
--- a/OVI/ProvaLemmario.ipynb
+++ b/OVI/ProvaLemmario.ipynb
@@ -0,0 +1,487 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 1,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import xml.etree.ElementTree as ET\n",
			
 
				+    "import os\n",
			
 
				+    "import csv"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 2,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "baseDir = '/home/kora/Desktop/OVI_Data_local/200_DATI_OVI/dati/'"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 3,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "gat = []\n",
			
 
				+    "basepath_gat = baseDir + 'xmlgat'\n",
			
 
				+    "for entry in os.listdir(basepath_gat):\n",
			
 
				+    "    if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant\n",
			
 
				+    "        gg = entry.split('.')[1]\n",
			
 
				+    "        if gg != '':\n",
			
 
				+    "            gat.append([gg, entry])\n",
			
 
				+    "\n",
			
 
				+    "nolemmi = []\n",
			
 
				+    "basepath_nolemmi = baseDir + 'DatiniXML_incompleto'\n",
			
 
				+    "for entry in os.listdir(basepath_nolemmi):\n",
			
 
				+    "    if os.path.isfile(os.path.join(basepath_nolemmi, entry)):\n",
			
 
				+    "        nn = entry.split('.')[1]\n",
			
 
				+    "        if nn != '':\n",
			
 
				+    "            nolemmi.append([nn, entry])\n",
			
 
				+    "\n",
			
 
				+    "lemmi = []\n",
			
 
				+    "basepath_lemmi = baseDir + 'lemmi'\n",
			
 
				+    "for entry in os.listdir(basepath_lemmi):\n",
			
 
				+    "    if os.path.isfile(os.path.join(basepath_lemmi, entry)):\n",
			
 
				+    "        ll = entry.split('.')[1]\n",
			
 
				+    "        if ll != '':\n",
			
 
				+    "            lemmi.append([ll, entry])\n"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 29,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')\n",
			
 
				+    "root = xmlparse.getroot()\n",
			
 
				+    "biblio = root.findall(\"Biblio\")\n",
			
 
				+    "\n",
			
 
				+    "sigle = []\n",
			
 
				+    "for bib in biblio:\n",
			
 
				+    "    sigla = bib.find(\"sigla\")\n",
			
 
				+    "    sigle.append(sigla.text.lower())"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 5,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "OVI_data = open(baseDir + 'OVI_Data.csv', 'w')\n",
			
 
				+    "csvwriter = csv.writer(OVI_data)\n",
			
 
				+    "\n",
			
 
				+    "params = [\"BiblioDatini\", \"nolemmi\", \"lemmi\", \"xmlgat\"]\n",
			
 
				+    "\n",
			
 
				+    "csvwriter.writerow(params)\n",
			
 
				+    "\n",
			
 
				+    "for sigla in sigle:\n",
			
 
				+    "    row = [sigla]\n",
			
 
				+    "    no_lemma = \" \"\n",
			
 
				+    "    lemma = \" \"\n",
			
 
				+    "    gatto = \" \"\n",
			
 
				+    "    for x in range(len(nolemmi)):\n",
			
 
				+    "        if sigla in nolemmi[x][0]:\n",
			
 
				+    "            no_lemma = nolemmi[x][1]\n",
			
 
				+    "    row.append(no_lemma)\n",
			
 
				+    "    for x in range(len(lemmi)):\n",
			
 
				+    "        if sigla == lemmi[x][0]:\n",
			
 
				+    "            lemma = lemmi[x][1]\n",
			
 
				+    "    row.append(lemma)\n",
			
 
				+    "    for x in range(len(gat)):\n",
			
 
				+    "        if sigla == gat[x][0]:\n",
			
 
				+    "            gatto = gat[x][1]\n",
			
 
				+    "    row.append(gatto)\n",
			
 
				+    "    csvwriter.writerow(row)\n",
			
 
				+    "\n",
			
 
				+    "OVI_data.close()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 6,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "#Cambiare percorsi file\n",
			
 
				+    "#xml_file_name = 'Ovi/BiblioDatini.xml'\n",
			
 
				+    "#tree = Xet.parse(xml_file_name)\n",
			
 
				+    "#root = tree.getroot()           --> già definito\n",
			
 
				+    "#biblio = root.findall(\"Biblio\") --> già definito\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "Datini_data = open(baseDir + 'Datini_Data.csv', 'w')\n",
			
 
				+    "csvwriter = csv.writer(Datini_data)\n",
			
 
				+    "\n",
			
 
				+    "elemList = []\n",
			
 
				+    "\n",
			
 
				+    "for elem in root.iter():\n",
			
 
				+    "    elemList.append(elem.tag)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "elemList = list(set(elemList))\n",
			
 
				+    "\n",
			
 
				+    "elemList.remove(\"dataroot\")\n",
			
 
				+    "elemList.remove(\"Biblio\")\n",
			
 
				+    "\n",
			
 
				+    "param = elemList\n",
			
 
				+    "\n",
			
 
				+    "csvwriter.writerow(param)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def cell(p, arr):\n",
			
 
				+    "    if arr.find(p) is None:\n",
			
 
				+    "        res = \" \"\n",
			
 
				+    "    else:\n",
			
 
				+    "        res = arr.find(p).text\n",
			
 
				+    "    return res\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "for scheda in biblio:\n",
			
 
				+    "    aut = []\n",
			
 
				+    "\n",
			
 
				+    "    for par in param:\n",
			
 
				+    "        if par == \"star_note\":\n",
			
 
				+    "            if scheda.find(\".//star_note\") is None:\n",
			
 
				+    "                r = \" \"\n",
			
 
				+    "            else:\n",
			
 
				+    "                r = \"True\"\n",
			
 
				+    "        else:\n",
			
 
				+    "            r = cell(par, scheda)\n",
			
 
				+    "        aut.append(r)\n",
			
 
				+    "\n",
			
 
				+    "    csvwriter.writerow(aut)\n",
			
 
				+    "\n",
			
 
				+    "Datini_data.close()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 8,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import re\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "iperlem_data = open(baseDir + 'lem_Data.csv', 'w')\n",
			
 
				+    "csvwriter = csv.writer(iperlem_data)\n",
			
 
				+    "\n",
			
 
				+    "params = [\"sigla\", \"file\", \"num\", \"lemma\", \"commento\", \"livello\"]\n",
			
 
				+    "csvwriter.writerow(params)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def write_lines(lines, sig, file):\n",
			
 
				+    "    for line in lines:\n",
			
 
				+    "        row = [sig, file]\n",
			
 
				+    "        lem = re.split('\\|', line)\n",
			
 
				+    "        for l in lem:\n",
			
 
				+    "#            if \"IPERLEMMA\" in l:\n",
			
 
				+    "#                l = l.replace(\"IPERLEMMA\", \" \")\n",
			
 
				+    "            m = l.strip()\n",
			
 
				+    "            row.append(m)\n",
			
 
				+    "#        print(row)\n",
			
 
				+    "        csvwriter.writerow(row)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "for x in range(len(lemmi)):\n",
			
 
				+    "    sigla = lemmi[x][0]\n",
			
 
				+    "    file_name = lemmi[x][1]\n",
			
 
				+    "    #Cambia percorso\n",
			
 
				+    "    f = open(baseDir + 'lemmi/' + file_name, \"r\", encoding='latin-1')\n",
			
 
				+    "    lines = f.readlines()\n",
			
 
				+    "#    print(lines)\n",
			
 
				+    "#    clean_lines = []\n",
			
 
				+    "#    for line in lines:\n",
			
 
				+    "#        if \"IPERLEMMA\" in line:\n",
			
 
				+    "#            clean_lines.append(line)\n",
			
 
				+    "    write_lines(lines, sigla, file_name)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "f.close()\n",
			
 
				+    "\n",
			
 
				+    "iperlem_data.close()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 9,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "\n",
			
 
				+    "rutto = []\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def write_lines_here(lines, sig):\n",
			
 
				+    "    toRet = []\n",
			
 
				+    "    for line in lines:\n",
			
 
				+    "        row = [sig]\n",
			
 
				+    "        lem = re.split('\\|', line)\n",
			
 
				+    "        for l in lem:\n",
			
 
				+    "#            if \"IPERLEMMA\" in l:\n",
			
 
				+    "#                l = l.replace(\"IPERLEMMA\", \" \")\n",
			
 
				+    "            m = l.strip()\n",
			
 
				+    "            row.append(m)\n",
			
 
				+    "#        print(row)\n",
			
 
				+    "        toRet.append(row)\n",
			
 
				+    "    return toRet\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "for x in range(len(lemmi)):\n",
			
 
				+    "    sigla = lemmi[x][0]\n",
			
 
				+    "    file_name = lemmi[x][1]\n",
			
 
				+    "    #Cambia percorso\n",
			
 
				+    "    f = open(baseDir + 'lemmi/' + file_name, \"r\", encoding='latin-1')\n",
			
 
				+    "    lines = f.readlines()\n",
			
 
				+    "#    print(lines)\n",
			
 
				+    "#    clean_lines = []\n",
			
 
				+    "#    for line in lines:\n",
			
 
				+    "#        if \"IPERLEMMA\" in line:\n",
			
 
				+    "#            clean_lines.append(line)\n",
			
 
				+    "    rutto = rutto + write_lines_here(lines, sigla)\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "f.close()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 10,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "99660"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 10,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "len(rutto)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 11,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "['c09', '1', 'balla', 's.f.', '']"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 11,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "rutto[0]"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 20,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "preprefinal = list(filter(lambda row: 'IPERLEMMA' not in row[1], rutto))\n",
			
 
				+    "prefinal = list(map(lambda row: (row[2], row[3], row[4]), preprefinal))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 21,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "82090"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 21,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "len(prefinal)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 22,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "final = list(set(prefinal))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 26,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "6989"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 26,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "len(final)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 27,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "final.sort()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 28,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "lem_data_unique = open(baseDir + 'lem_unique.csv', 'w')\n",
			
 
				+    "csvwriter = csv.writer(lem_data_unique)\n",
			
 
				+    "\n",
			
 
				+    "csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])\n",
			
 
				+    "\n",
			
 
				+    "for line in final:\n",
			
 
				+    "    csvwriter.writerow(line)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 58,
			
 
				+   "metadata": {
			
 
				+    "scrolled": true
			
 
				+   },
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "{'n': '4', 'type': '2'}\n",
			
 
				+      "{'n': '4', 'type': '2'}\n",
			
 
				+      "{'n': '11', 'type': '5'}\n",
			
 
				+      "{'n': '9', 'type': '4'}\n",
			
 
				+      "{'n': '9', 'type': '4'}\n",
			
 
				+      "{'n': '6', 'type': '3'}\n",
			
 
				+      "{'n': '12', 'type': '0'}\n",
			
 
				+      "{'n': '15', 'type': '0'}\n",
			
 
				+      "{'n': '13', 'type': '6'}\n",
			
 
				+      "{'n': '7', 'type': '0'}\n",
			
 
				+      "{'n': '8', 'type': '0'}\n",
			
 
				+      "{'n': '16', 'type': '1'}\n",
			
 
				+      "{'n': '10', 'type': '1'}\n",
			
 
				+      "{'n': '14', 'type': '1'}\n",
			
 
				+      "{'n': '1', 'type': '0'}\n",
			
 
				+      "{'n': '2', 'type': '0'}\n",
			
 
				+      "{'n': '5', 'type': '0'}\n",
			
 
				+      "{'n': '3', 'type': '0'}\n",
			
 
				+      "{'n': '4', 'type': '2'}\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "smalltree = ET.parse(baseDir + 'xmlgat/' + gat[2][1])\n",
			
 
				+    "smallroot = smalltree.getroot()\n",
			
 
				+    "\n",
			
 
				+    "azz = smallroot.iter('lem')\n",
			
 
				+    "\n",
			
 
				+    "for node in azz:\n",
			
 
				+    "    print(node.attrib)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Microprova di modifica + export di XML (modifico un attributo di un tag)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 59,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "{'n': '100h', 'type': '2'}\n",
			
 
				+      "{'n': '100h', 'type': '2'}\n",
			
 
				+      "{'n': '100h', 'type': '5'}\n",
			
 
				+      "{'n': '100h', 'type': '4'}\n",
			
 
				+      "{'n': '100h', 'type': '4'}\n",
			
 
				+      "{'n': '100h', 'type': '3'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '6'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '1'}\n",
			
 
				+      "{'n': '100h', 'type': '1'}\n",
			
 
				+      "{'n': '100h', 'type': '1'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '0'}\n",
			
 
				+      "{'n': '100h', 'type': '2'}\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "for node in smallroot.iter('lem'):\n",
			
 
				+    "    node.set('n', '100h')\n",
			
 
				+    "    print(node.attrib)\n",
			
 
				+    "        \n",
			
 
				+    "smalltree.write(baseDir + 'prova.xml')"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.8.5"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 2
			
 
				+}