2 anni fa · a9c7c14c8c
--- a/ASPO/EAD_to_CSV/EAD_to_CSV_ospedale_newTest.ipynb
+++ b/ASPO/EAD_to_CSV/EAD_to_CSV_ospedale_newTest.ipynb
@@ -0,0 +1,1322 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Inizio radunando tutti gli **IMPORT** necessari per girare il notebook, per chiarezza."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 1,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# IMPORT ESSENZIALI\n",
			
 
				+    "\n",
			
 
				+    "# Per il parsing dell'XML -- questo pacchetto è incluso anche nel più generale lxml\n",
			
 
				+    "import xml.etree.ElementTree as ET\n",
			
 
				+    "# Utilities per leggere/scrivere files csv\n",
			
 
				+    "import csv\n",
			
 
				+    "# Utilities per gestire i character encodings\n",
			
 
				+    "import unicodedata\n",
			
 
				+    "# Dizionari ordinati\n",
			
 
				+    "from collections import OrderedDict\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "# IMPORT OPZIONALI\n",
			
 
				+    "\n",
			
 
				+    "# Per fare un stima della velocità delle varie istruzioni\n",
			
 
				+    "from datetime import datetime\n",
			
 
				+    "# Generatore di numeri casuali -- può sempre servire in fase di testing\n",
			
 
				+    "from random import *\n",
			
 
				+    "# Può servire per alcuni test\n",
			
 
				+    "import sys"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# FUNZIONI\n",
			
 
				+    "\n",
			
 
				+    "**ElementTree** ha una funzione built-in, **iter**, che scorre (molto velocemente) su tutti i 'nodi' dell'albero di dati che rappresenta l'XML. La funzione *iter* purtroppo però non traccia i nodi 'parents'.\n",
			
 
				+    "\n",
			
 
				+    "Ho esteso quindi la libreria scrivendo una mia versione di *iter*, **'traceElems'**, che dovrebbe riuscire a fornirci tutto quello di cui abbiamo bisogno.\n",
			
 
				+    "\n",
			
 
				+    "*traceElems* traccia tutti i nodi nell'albero tenendo conto dei 'parents', e restituisce tutti quelli per cui la funzione-argomento 'condition' ritorna True. **NON** indaga i nodi **figli** di quelli che sono restituiti."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 2,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# La funzione BASE: traceElems\n",
			
 
				+    "def traceElems(node: ET.Element, condition, parents: list = [], coords: list = []):\n",
			
 
				+    "    res = []\n",
			
 
				+    "    jj = 0\n",
			
 
				+    "    for child in node:\n",
			
 
				+    "        if condition(child):\n",
			
 
				+    "            res.append({'a_par': parents+[node],\n",
			
 
				+    "                        'coords': coords+[jj], 'child': child})\n",
			
 
				+    "        else:\n",
			
 
				+    "            res = res + traceElems(child, condition, parents+[node], coords+[jj])\n",
			
 
				+    "        jj = jj+1   \n",
			
 
				+    "    return res\n",
			
 
				+    "\n",
			
 
				+    "# Funzione-base per stoppare traceElems\n",
			
 
				+    "def isLeafOrC(aa: ET.Element):\n",
			
 
				+    "    if(aa.tag=='c' or len(aa)==0):\n",
			
 
				+    "        return True\n",
			
 
				+    "    else:\n",
			
 
				+    "        return False"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 3,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Funzioni-utilità che servono solo a visualizzare meglio i dati sul notebook.\n",
			
 
				+    "def shownode(node: ET.Element):\n",
			
 
				+    "    return (node.tag, node.attrib, node.text.replace('\\t','').replace('n','').strip() \\\n",
			
 
				+    "                               if type(node.text) is str else '')\n",
			
 
				+    "\n",
			
 
				+    "def shownodelist(el: ET.Element):\n",
			
 
				+    "    return list(map(shownode, el))\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "# Utility copiata da INTERNEZZ -- versione 'multipla' del metodo str.index:\n",
			
 
				+    "def indices(lst, element):\n",
			
 
				+    "    result = []\n",
			
 
				+    "    offset = -1\n",
			
 
				+    "    while True:\n",
			
 
				+    "        try:\n",
			
 
				+    "            offset = lst.index(element, offset+1)\n",
			
 
				+    "        except ValueError:\n",
			
 
				+    "            return result\n",
			
 
				+    "        result.append(offset)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# AL LAVORO"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "**DA CAMBIARE A SECONDA DEL COMPUTER**: directory di input e output"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 4,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import_dir = '/Users/federicaspinelli/Google Drive/OVI:CNR/LAVORO 2020/SELEZIONE CONTENUTI/01_ASPO/XDAMS/'\n",
			
 
				+    "export_dir = '/Users/federicaspinelli/Google Drive/OVI:CNR/CSV/ASPO/ospedale/'"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Importo il file XML del Datini, tracciando il tempo necessario"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 5,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "1.052720069885254\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "treeDatini = ET.parse(import_dir + 'export_aspoSt005--ospedale.xml')\n",
			
 
				+    "rootDatini = treeDatini.getroot()\n",
			
 
				+    "\n",
			
 
				+    "ts2 = datetime.timestamp(datetime.now())\n",
			
 
				+    "print(ts2 - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Uso *iter* per trovare tutti i nodi con label **'c'** nel file Datini, e mi faccio restituire il\n",
			
 
				+    "valore dell'attributo **'level'**; salvo tutti i *levels* nella variabile **cLevs**"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 6,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "{'subseries', 'series', 'file', 'subgrp', 'otherlevel', 'recordgrp', 'collection', 'fonds', 'subfonds'}\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "cLevs = set(map(lambda a : a.attrib['level'], rootDatini.iter('c')))\n",
			
 
				+    "print(cLevs)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "A questo punto metto al lavoro la funzione **traceElems**: registro TUTTI i nodi **'c'** dividendoli in base all'attributo **'level'**; mi faccio stampare il numero di elementi per ogni livello ed il tempo trascorso.\n",
			
 
				+    "\n",
			
 
				+    "**OCCHIO:** per come è costruita, questa routine non va ad investigare dentro i livelli restituiti -- quindi si perde eventuali sotto-livelli con la stessa label di quelli che trova durante il primo scan. La presenza di sotto-livelli di questo tipo va controllata separatamente."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 7,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "# di tag \"c\", livello subseries, primo passaggio: 151\n",
			
 
				+      "# di tag \"c\", livello series, primo passaggio: 254\n",
			
 
				+      "# di tag \"c\", livello file, primo passaggio: 7199\n",
			
 
				+      "# di tag \"c\", livello subgrp, primo passaggio: 10\n",
			
 
				+      "# di tag \"c\", livello otherlevel, primo passaggio: 321\n",
			
 
				+      "# di tag \"c\", livello recordgrp, primo passaggio: 7\n",
			
 
				+      "# di tag \"c\", livello collection, primo passaggio: 1\n",
			
 
				+      "# di tag \"c\", livello fonds, primo passaggio: 1\n",
			
 
				+      "# di tag \"c\", livello subfonds, primo passaggio: 3\n",
			
 
				+      "\n",
			
 
				+      "Tempo trascorso: 0.7257428169250488\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "allCs = {}\n",
			
 
				+    "\n",
			
 
				+    "for label in cLevs:\n",
			
 
				+    "    def tempFilt(aa: ET.Element):\n",
			
 
				+    "        if(aa.tag=='c' and aa.attrib['level']==label):\n",
			
 
				+    "            return True\n",
			
 
				+    "        else:\n",
			
 
				+    "            return False\n",
			
 
				+    "       \n",
			
 
				+    "    allCs[label] = traceElems(rootDatini, tempFilt);\n",
			
 
				+    "    print('# di tag \"c\", livello ' + label + ', primo passaggio:', len(allCs[label]))\n",
			
 
				+    "\n",
			
 
				+    "print()\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Notare che l'elaborazione è piuttosto veloce (sul mio laptop) malgrado la dimensione del file.\n",
			
 
				+    "\n",
			
 
				+    "Rimane il problema dei livelli dentro a livelli omonimi. Vediamo di affrontarlo."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 8,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "# di tag \"c\", livello subseries, primo passaggio: 151\n",
			
 
				+      "# di tag \"c\", livello subseries, totali: 163\n",
			
 
				+      "# di tag \"c\", livello series, primo passaggio: 254\n",
			
 
				+      "# di tag \"c\", livello series, totali: 254\n",
			
 
				+      "# di tag \"c\", livello file, primo passaggio: 7199\n",
			
 
				+      "# di tag \"c\", livello file, totali: 7199\n",
			
 
				+      "# di tag \"c\", livello subgrp, primo passaggio: 10\n",
			
 
				+      "# di tag \"c\", livello subgrp, totali: 10\n",
			
 
				+      "# di tag \"c\", livello otherlevel, primo passaggio: 321\n",
			
 
				+      "# di tag \"c\", livello otherlevel, totali: 321\n",
			
 
				+      "# di tag \"c\", livello recordgrp, primo passaggio: 7\n",
			
 
				+      "# di tag \"c\", livello recordgrp, totali: 7\n",
			
 
				+      "# di tag \"c\", livello collection, primo passaggio: 1\n",
			
 
				+      "# di tag \"c\", livello collection, totali: 1\n",
			
 
				+      "# di tag \"c\", livello fonds, primo passaggio: 1\n",
			
 
				+      "# di tag \"c\", livello fonds, totali: 11\n",
			
 
				+      "# di tag \"c\", livello subfonds, primo passaggio: 3\n",
			
 
				+      "# di tag \"c\", livello subfonds, totali: 3\n",
			
 
				+      "\n",
			
 
				+      "Tempo trascorso: 1.1849439144134521\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "allCs2 = {}\n",
			
 
				+    "\n",
			
 
				+    "for label in cLevs:\n",
			
 
				+    "    partial = allCs[label]\n",
			
 
				+    "    print('# di tag \"c\", livello ' + label + ', primo passaggio:', len(partial))\n",
			
 
				+    "    allCs2[label] = partial\n",
			
 
				+    "    partialUpdate = []\n",
			
 
				+    "    while True:\n",
			
 
				+    "        def tempFilt(aa: ET.Element):\n",
			
 
				+    "            if(aa.tag=='c' and aa.attrib['level']==label):\n",
			
 
				+    "                 return True\n",
			
 
				+    "            else:\n",
			
 
				+    "                 return False\n",
			
 
				+    "        for node in partial:\n",
			
 
				+    "            partialUpdate = partialUpdate + traceElems(node['child'], tempFilt)\n",
			
 
				+    "        #print(len(partialUpdate))\n",
			
 
				+    "        partial = partialUpdate\n",
			
 
				+    "        if(len(partialUpdate)==0):\n",
			
 
				+    "            break\n",
			
 
				+    "        allCs2[label] = allCs2[label] + partial\n",
			
 
				+    "        partialUpdate = []\n",
			
 
				+    "\n",
			
 
				+    "    print('# di tag \"c\", livello ' + label + ', totali:', len(allCs2[label]))\n",
			
 
				+    "\n",
			
 
				+    "print()\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "A questo punto diventa facile visualizzare tutti i dettagli dei vari elementi **'c'**, di qualunque livello; un esempio è fornito nella prossima cella. Si può cambiare l'elemento da visualizzare cambiando il valore delle variabili *ii* e *level*"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 9,
			
 
				+   "metadata": {
			
 
				+    "tags": []
			
 
				+   },
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "\n",
			
 
				+      "\n",
			
 
				+      "Level: otherlevel\n",
			
 
				+      "#: 1\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('unittitle', {'encodinganalog': 'ISAD 3-1-2 title'}, '\"Spedale dei gettatelli i Prato. Allegati al Redimeto di coti dell\\'ao 1836\"')]\n",
			
 
				+      "[0, 0, 0]\n",
			
 
				+      "('unitdate', {'encodinganalog': 'ISAD 3-1-3 date(s)', 'normal': '18360101-18361231'}, '1836')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('unittitle', {'encodinganalog': 'ISAD 3-1-2 title'}, '\"Spedale dei gettatelli i Prato. Allegati al Redimeto di coti dell\\'ao 1836\"')]\n",
			
 
				+      "[0, 0, 1]\n",
			
 
				+      "('num', {'type': 'nuovo ordinamento'}, '2')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('unitid', {'encodinganalog': 'ISAD 3-1-1 reference code'}, '')]\n",
			
 
				+      "[0, 1, 0]\n",
			
 
				+      "('extref', {'role': 'id_arianna'}, '2598')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('physdesc', {'encodinganalog': 'ISAD 3-1-5 extent and medium of the unit of description'}, '')]\n",
			
 
				+      "[0, 2, 0]\n",
			
 
				+      "('extent', {}, 'cc. ..')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('physdesc', {'encodinganalog': 'ISAD 3-1-5 extent and medium of the unit of description'}, '')]\n",
			
 
				+      "[0, 2, 1]\n",
			
 
				+      "('genreform', {}, 'registro')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('physdesc', {'encodinganalog': 'ISAD 3-1-5 extent and medium of the unit of description'}, '')]\n",
			
 
				+      "[0, 2, 2]\n",
			
 
				+      "('physfacet', {'type': 'supporto'}, 'cartaceo')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('did', {}, ''), ('physdesc', {'encodinganalog': 'ISAD 3-1-5 extent and medium of the unit of description'}, '')]\n",
			
 
				+      "[0, 2, 3]\n",
			
 
				+      "('physfacet', {'type': 'note'}, 'Legaturaï¿½origialeï¿½iï¿½cartoe')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('processinfo', {}, ''), ('list', {}, ''), ('item', {}, 'iserimeto i Ariaa 3.2')]\n",
			
 
				+      "[1, 0, 0, 0]\n",
			
 
				+      "('persname', {}, 'utete Ariaa')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('processinfo', {}, ''), ('list', {}, ''), ('item', {}, 'iserimeto i Ariaa 3.2')]\n",
			
 
				+      "[1, 0, 0, 1]\n",
			
 
				+      "('date', {}, '05-02-2007')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('processinfo', {}, ''), ('list', {}, ''), ('item', {}, 'coversioe da Ariaa 3.2')]\n",
			
 
				+      "[1, 0, 1, 0]\n",
			
 
				+      "('persname', {}, 'admiistrator Regesta.exe')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n",
			
 
				+      "[('c', {'level': 'otherlevel', 'otherlevel': 'subfile', 'id': 'IT-ASPO-ST00005-0002653', 'audience': 'external'}, ''), ('processinfo', {}, ''), ('list', {}, ''), ('item', {}, 'coversioe da Ariaa 3.2')]\n",
			
 
				+      "[1, 0, 1, 1]\n",
			
 
				+      "('date', {}, '13-03-2013')\n",
			
 
				+      "# of children: 0\n",
			
 
				+      "\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ii = 1\n",
			
 
				+    "level = 'otherlevel'\n",
			
 
				+    "test = allCs2[level][ii]\n",
			
 
				+    "toProc = traceElems(test['child'], isLeafOrC)\n",
			
 
				+    "\n",
			
 
				+    "# Commentare/scommentare per stampare qui / su file\n",
			
 
				+    "# (vedi anche in fondo alla cella)\n",
			
 
				+    "#provaFileName = 'out.txt'\n",
			
 
				+    "#orig_stdout = sys.stdout\n",
			
 
				+    "#fp = open(export_dir + provaFileName, 'w')\n",
			
 
				+    "#sys.stdout = fp\n",
			
 
				+    "# fino qui + in fondo\n",
			
 
				+    "\n",
			
 
				+    "print()\n",
			
 
				+    "print()\n",
			
 
				+    "print('Level:', level)\n",
			
 
				+    "print('#:', ii)\n",
			
 
				+    "print()\n",
			
 
				+    "for node in toProc:\n",
			
 
				+    "    print(shownodelist(node['a_par']))\n",
			
 
				+    "    print(node['coords'])\n",
			
 
				+    "    print(shownode(node['child']))\n",
			
 
				+    "    print('# of children:', len(node['child']))\n",
			
 
				+    "    print()\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "# Commentare/scommentare per stampare qui / su file\n",
			
 
				+    "#sys.stdout = orig_stdout\n",
			
 
				+    "#fp.close()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "(*NOTA X ME:* **'did' = 'Descriptive IDentification'**)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "A questo punto, quello che devo fare è scrivere un **traduttore** -- una funzione che scorra l'output degli elementi esaminati e trasformi le info in modo da poterle esportare in formato csv (o in qualunque altro formato vogliamo).\n",
			
 
				+    "\n",
			
 
				+    "La mia attuale versione di **traduttore per gli item** è data nella prossima cella; accetta come argomento un nodo (che è supposto essere di tipo item) e restituisce un dict."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 10,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "def traduttoreItem(elem):\n",
			
 
				+    "    # Variabile che contiene l'output della traduzione:\n",
			
 
				+    "    csvProt = {}\n",
			
 
				+    "\n",
			
 
				+    "    # Processo i nodi-parent di 'elem'\n",
			
 
				+    "    par_tags = list(map(lambda a: a.tag, elem['a_par']))\n",
			
 
				+    "    par_attributes = list(map(lambda a: a.attrib, elem['a_par']))\n",
			
 
				+    "    \n",
			
 
				+    "    # e0: Le varie id dei nodi parent\n",
			
 
				+    "    for ii in indices(par_tags, 'c'):\n",
			
 
				+    "        key = 'id_' + par_attributes[ii]['level']\n",
			
 
				+    "        csvProt[key] = par_attributes[ii]['id']\n",
			
 
				+    "\n",
			
 
				+    "    # Processo i nodi-child di 'elem'\n",
			
 
				+    "    toProc = traceElems(elem['child'], isLeafOrC)\n",
			
 
				+    "    first = True\n",
			
 
				+    "    for node in toProc:\n",
			
 
				+    "        tags = list(map(lambda a: a.tag, node['a_par'])) + [node['child'].tag]\n",
			
 
				+    "        attributes = list(map(lambda a: a.attrib, node['a_par'])) + [node['child'].attrib]\n",
			
 
				+    "        content = node['child'].text\n",
			
 
				+    "\n",
			
 
				+    "        # Da controllare solo per il primo nodo\n",
			
 
				+    "        # (informazioni a livello del nodo, uguali per tutti i figli)\n",
			
 
				+    "        if(first):\n",
			
 
				+    "            # e1 ID della item\n",
			
 
				+    "            csvProt['id'] = attributes[tags.index('c')]['id']\n",
			
 
				+    "            # e2 Audience: external o internal\n",
			
 
				+    "            try:\n",
			
 
				+    "                csvProt['audience'] = attributes[tags.index('c')]['audience']\n",
			
 
				+    "            except:\n",
			
 
				+    "                pass\n",
			
 
				+    "            # e3 Otherlevel\n",
			
 
				+    "            try:\n",
			
 
				+    "                csvProt['altro_livello'] = attributes[tags.index('c')]['otherlevel']\n",
			
 
				+    "            except:\n",
			
 
				+    "                pass\n",
			
 
				+    "            first = False\n",
			
 
				+    "\n",
			
 
				+    "        # La 'ciccia': si processa il contenuto vero e proprio\n",
			
 
				+    "        # e4 Repository (qui dovrebbe essere sempre l'Archivio di Prato)\n",
			
 
				+    "        if('repository' in tags):\n",
			
 
				+    "            csvProt['repository'] = content  \n",
			
 
				+    "\n",
			
 
				+    "        # e8 Tipologia\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('materialspec')\n",
			
 
				+    "            if(attributes[ii]['label']=='tipologia'): \n",
			
 
				+    "                csvProt['tipologia'] = content\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "\n",
			
 
				+    "        # e9 Segnatura attuale\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('num')\n",
			
 
				+    "            type1 = attributes[ii]['type']\n",
			
 
				+    "            if(type1=='nuovo ordinamento'):\n",
			
 
				+    "               csvProt['segnatura_attuale'] = content\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "        # e9 Segnatura precedente (Odd)\n",
			
 
				+    "        if('odd' in tags):\n",
			
 
				+    "            csvProt['segnatura_precedente'] = content       \n",
			
 
				+    "        \n",
			
 
				+    "        # e11 Il titolo da unittitle\n",
			
 
				+    "        try:\n",
			
 
				+    "            aa = csvProt['titolo_aspo']\n",
			
 
				+    "        except:\n",
			
 
				+    "            try:\n",
			
 
				+    "                ii = tags.index('unittitle')\n",
			
 
				+    "                try:\n",
			
 
				+    "                    csvProt['titolo_aspo'] = str(node['a_par'][ii].text).replace('\\t','').replace('\\n','').strip()\n",
			
 
				+    "                except:\n",
			
 
				+    "                    csvProt['titolo_aspo'] = str(content).replace('\\t','').replace('\\n','').strip()\n",
			
 
				+    "            except:\n",
			
 
				+    "                pass\n",
			
 
				+    "        \n",
			
 
				+    "        # e12 Scope-content head & body\n",
			
 
				+    "        if('scopecontent' in tags):\n",
			
 
				+    "            if('p' in tags):\n",
			
 
				+    "                csvProt['scope-content_body'] = content\n",
			
 
				+    "\n",
			
 
				+    "        # e14 Nome della compagnia\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('corpname')\n",
			
 
				+    "            if(attributes[ii]['authfilenumber']):\n",
			
 
				+    "                try:\n",
			
 
				+    "                    authId = attributes[ii]['authfilenumber']\n",
			
 
				+    "                    csvProt['compagnia'] = '{\"nome\": ' + \"\\\"\" + content + \"\\\"\" + ', \"authID\": ' + \"\\\"\" + authId + \"\\\"\" + '}'\n",
			
 
				+    "                except:\n",
			
 
				+    "                    csvProt['compagnia'] = '{\"nome\": ' + \"\\\"\" + content + \"\\\"\" + '}'\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "        \n",
			
 
				+    "        # e16 Persona\n",
			
 
				+    "        if ('controlaccess' in tags):\n",
			
 
				+    "            try: \n",
			
 
				+    "                ii=tags.index('persname')       \n",
			
 
				+    "                key='persona'  \n",
			
 
				+    "                authId = attributes[ii]['authfilenumber']\n",
			
 
				+    "                try:       \n",
			
 
				+    "                    csvProt[key] = csvProt[key] + ' | {\"nome\": ' + \"\\\"\" + content + \"\\\"\" + ', \"authID\": ' + \"\\\"\" + authId + \"\\\"\" +'}'\n",
			
 
				+    "                except:                         \n",
			
 
				+    "                    csvProt[key] = '{\"nome\": ' + \"\\\"\" + content + \"\\\"\" + ', \"authID\": ' + \"\\\"\" + authId + \"\\\"\" +'}'\n",
			
 
				+    "            except:                \n",
			
 
				+    "                try:\n",
			
 
				+    "                    csvProt[key] = csvProt[key] + ' | {\"nome\": ' + \"\\\"\" + content + \"\\\"\" + '}'\n",
			
 
				+    "                except:\n",
			
 
				+    "                    csvProt[key] = '{\"nome\": ' + \"\\\"\" + content + \"\\\"\" + '}'\n",
			
 
				+    "\n",
			
 
				+    "        # e17 Date\n",
			
 
				+    "        if ('unittitle' in tags):\n",
			
 
				+    "            try:\n",
			
 
				+    "                ii = tags.index('date')\n",
			
 
				+    "                key = 'data'\n",
			
 
				+    "                csvProt[key] = content\n",
			
 
				+    "            except:\n",
			
 
				+    "                pass\n",
			
 
				+    "       \n",
			
 
				+    "        # e18 Data 1: periodo\n",
			
 
				+    "        if('unitdate' in tags):\n",
			
 
				+    "            csvProt['data_periodo'] = content\n",
			
 
				+    "        \n",
			
 
				+    "        # e20 Supporto fisico\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('physfacet')\n",
			
 
				+    "            if(attributes[ii]['type']=='supporto'):\n",
			
 
				+    "                csvProt['supporto'] = content\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "\n",
			
 
				+    "        # e21 Physdesc \n",
			
 
				+    "        if('extent' in tags):\n",
			
 
				+    "            csvProt['numero'] = content\n",
			
 
				+    "        if('genreform' in tags):\n",
			
 
				+    "            csvProt['genere'] = content\n",
			
 
				+    "        \n",
			
 
				+    "        # e21 Dimensions\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('dimensions')\n",
			
 
				+    "            try:\n",
			
 
				+    "                csvProt['dimensione_altezza_larghezza_spessore'] = csvProt['dimensione_altezza_larghezza_spessore'] + ' | ' + content\n",
			
 
				+    "            except:\n",
			
 
				+    "                csvProt['dimensione_altezza_larghezza_spessore'] = content\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "        \n",
			
 
				+    "        # e22 Phystech  \n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('phystech')\n",
			
 
				+    "            try:\n",
			
 
				+    "                csvProt['conservazione'] = csvProt['conservazione'] + ' | ' + content\n",
			
 
				+    "            except:\n",
			
 
				+    "                csvProt['conservazione'] = content\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "        \n",
			
 
				+    "        # e24 Note\n",
			
 
				+    "        if('note' in tags):\n",
			
 
				+    "            csvProt['nota'] = content\n",
			
 
				+    "        \n",
			
 
				+    "        # e26 Oggetto digitale allegato (nome)\n",
			
 
				+    "        try:\n",
			
 
				+    "            ii = tags.index('daoloc')\n",
			
 
				+    "            out = attributes[ii]['title']\n",
			
 
				+    "            try:\n",
			
 
				+    "                csvProt['oggetto_digitale'] = csvProt['oggetto_digitale'] + ' | ' + out\n",
			
 
				+    "            except:\n",
			
 
				+    "                csvProt['oggetto_digitale'] = out\n",
			
 
				+    "        except:\n",
			
 
				+    "            pass\n",
			
 
				+    "\n",
			
 
				+    "    return csvProt\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "# Di pari passo alla funzione, definisco un dict contenente tutti gli header;\n",
			
 
				+    "# servirà per il CSV.\n",
			
 
				+    "itemHeader = OrderedDict()\n",
			
 
				+    "\n",
			
 
				+    "# e1 ID dell'entità\n",
			
 
				+    "itemHeader.update({'id': '<c level=\"X\" id=#>'})\n",
			
 
				+    "\n",
			
 
				+    "# e2 Audience: external o internal\n",
			
 
				+    "itemHeader.update({'audience': '<c level=\"#\" audience=#>'})\n",
			
 
				+    "\n",
			
 
				+    "# e3 Otherlevel\n",
			
 
				+    "itemHeader.update({'altro_livello': '<c otherlevel=#>'})\n",
			
 
				+    "\n",
			
 
				+    "# e4 Repository (qui dovrebbe essere sempre l'Archivio di Prato)\n",
			
 
				+    "itemHeader.update({'repository': '<repository>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e8 Tipologia\n",
			
 
				+    "itemHeader.update({'tipologia': '<materialspec label=\"tipologia\">#'})\n",
			
 
				+    "\n",
			
 
				+    "# e9 Segnatura attuale\n",
			
 
				+    "itemHeader.update({'segnatura_attuale': '<num type=\"nuovo ordinamento\">#'})\n",
			
 
				+    "\n",
			
 
				+    "# e9 Segnatura precedente\n",
			
 
				+    "itemHeader.update({'segnatura_precedente': '<odd>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e11 Titolo ASPO\n",
			
 
				+    "itemHeader.update({'titolo_aspo': '<unittitle>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e12 Scope content, head & body\n",
			
 
				+    "itemHeader.update(\n",
			
 
				+    "{'scope-content_head': '<scopecontent><head>#',\n",
			
 
				+    " 'scope-content_body': '<scopecontent><p>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e14 Nome della compagnia\n",
			
 
				+    "itemHeader.update({'compagnia': '<corpname>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e15 Soggetto\n",
			
 
				+    "itemHeader.update({'soggetto': '<subject>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e16 Persona\n",
			
 
				+    "itemHeader.update({'persona': '<persname authfilenumber=#>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e17 Date\n",
			
 
				+    "itemHeader.update(\n",
			
 
				+    "{'data': '<date>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e18 Data 1: periodo\n",
			
 
				+    "itemHeader.update({'data_periodo': '<unitdate>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e20 Supporto fisico\n",
			
 
				+    "itemHeader.update({'supporto': '<physfacet type=\"supporto\">#'})\n",
			
 
				+    "\n",
			
 
				+    "# e21 descrizione fisica\n",
			
 
				+    "itemHeader.update({'numero': '<extent>#'})\n",
			
 
				+    "itemHeader.update({'genere': '<genreform>#'})\n",
			
 
				+    "# e21 dimensions\n",
			
 
				+    "itemHeader.update({'dimensione_altezza_larghezza_spessore': '<dimensions>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e22 Phystech\n",
			
 
				+    "itemHeader.update({'conservazione': '<phystech>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e23 Consistenza\n",
			
 
				+    "itemHeader.update({'consistenza': '<extent unit=#1>#2, #1: #2'})\n",
			
 
				+    "\n",
			
 
				+    "# 24 Note\n",
			
 
				+    "itemHeader.update({'nota': '<note>#'})\n",
			
 
				+    "\n",
			
 
				+    "# e26 Oggetto digitale allegato (nome)\n",
			
 
				+    "itemHeader.update({'oggetto_digitale': '<daoloc title=#>'})\n",
			
 
				+    "\n",
			
 
				+    "#0: Le varie id dei nodi parent\n",
			
 
				+    "itemHeader.update(\n",
			
 
				+    "{'id_subfonds': '<c level=\"subfonds\" id=#>',\n",
			
 
				+    " 'id_fonds': '<c level=\"fonds\" id=#>',\n",
			
 
				+    " 'id_series': '<c level=\"series\" id=#>',\n",
			
 
				+    " 'id_subseries': '<c level=\"subseries\" id=#>',\n",
			
 
				+    " 'id_recordgrp': '<c level=\"recordgrp\" id=#>',\n",
			
 
				+    " 'id_otherlevel': '<c level=\"otherlevel\" id=# otherlevel=\"subfile\">',\n",
			
 
				+    " 'id_collection': '<c level=\"collection\" id=#>',\n",
			
 
				+    " 'id_subgrp': '<c level=\"subgrp\" id=#>',\n",
			
 
				+    " 'id_collection': '<c level=\"collection\" id=#>',\n",
			
 
				+    " 'id_file': '<c level=\"file\" id=#>'})"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Test della funzione traduttore\n",
			
 
				+    "\n",
			
 
				+    "**NB:** l'ho definita basandomi sugli item, ma sembra funzionare decentemente anche sugli altri livelli!"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 11,
			
 
				+   "metadata": {
			
 
				+    "tags": []
			
 
				+   },
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "id_fonds: IT-ASPO-ST00005-0000002\n",
			
 
				+      "\n",
			
 
				+      "id_series: IT-ASPO-ST00005-0000003\n",
			
 
				+      "\n",
			
 
				+      "id: IT-ASPO-ST00005-0000004\n",
			
 
				+      "\n",
			
 
				+      "audience: external\n",
			
 
				+      "\n",
			
 
				+      "titolo_aspo: \"N.2 1420. Testamento di mona Margherita Buri moglie di Paolo Saccagnini con obbligo a' suoi figliuoli eredi di fare ogni anno in S. Francesco la festa di S. Antonio da Padova\"\n",
			
 
				+      "\n",
			
 
				+      "data_periodo: 1420ï¿½nov.ï¿½19\n",
			
 
				+      "\n",
			
 
				+      "segnatura_attuale: 1\n",
			
 
				+      "\n",
			
 
				+      "genere: pergamena\n",
			
 
				+      "\n",
			
 
				+      "dimensione_altezza_larghezza_spessore: 370 | 255 | 20\n",
			
 
				+      "\n",
			
 
				+      "persona: {\"nome\": \"Buri Margherita di Bartolomeo\", \"authID\": \"IT-ASPO-AU00002-0001222\"} | {\"nome\": \"Saccagnini Paolo di Vanni da Pratolino\", \"authID\": \"IT-ASPO-AU00002-0001014\"} | {\"nome\": \"Bandini Paolo di ser Vannozzo\", \"authID\": \"IT-ASPO-AU00002-0001330\"}\n",
			
 
				+      "\n",
			
 
				+      "scope-content_body: Atto rogato da Paolo di ser Vannozzo Bandini da Prato.\n",
			
 
				+      "\n",
			
 
				+      "conservazione: buona\n",
			
 
				+      "\n",
			
 
				+      "segnatura_precedente: 2996/5\n",
			
 
				+      "\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "test = allCs2['file'][0]\n",
			
 
				+    "toShow = traduttoreItem(test)\n",
			
 
				+    "for key in toShow.keys():\n",
			
 
				+    "    print(key + ': ' + str(toShow[key]))\n",
			
 
				+    "    print()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Export\n",
			
 
				+    "\n",
			
 
				+    "Produciamo il CSV per gli item tracciando, al solito, il tempo impiegato."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 12,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 1.894237995147705\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "# Do it! Export del CSV - items.\n",
			
 
				+    "\n",
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "# Apro il file per l'export\n",
			
 
				+    "with open(export_dir + \"data_file.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    # Definisco la classe-motore per l'export\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(itemHeader.keys()))\n",
			
 
				+    "    # Scrivo l'intestazione\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    # Scrivo la seconda riga, esplicativa\n",
			
 
				+    "    writer.writerow(itemHeader)\n",
			
 
				+    "    # Scrivo gli item tradotti, uno a uno\n",
			
 
				+    "    for ii in range(len(allCs2['file'])):\n",
			
 
				+    "        test = allCs2['file'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Altri livelli"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Definisco un dizionario ridotto per l'header delle *subseries*, poi esporto -- per il momento con lo stesso traduttore usato per gli *item*"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 13,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.009209156036376953\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "subSeriesKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['subgrp'])):\n",
			
 
				+    "    test = allCs2['subgrp'][ii]\n",
			
 
				+    "    subSeriesKeys = subSeriesKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "subSeriesHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in subSeriesKeys):\n",
			
 
				+    "        subSeriesHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_subgrp.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(subSeriesHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(subSeriesHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['subgrp'])):\n",
			
 
				+    "        test = allCs2['subgrp'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 14,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.14556884765625\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "subSeriesKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['subseries'])):\n",
			
 
				+    "    test = allCs2['subseries'][ii]\n",
			
 
				+    "    subSeriesKeys = subSeriesKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "subSeriesHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in subSeriesKeys):\n",
			
 
				+    "        subSeriesHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_subseries.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(subSeriesHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(subSeriesHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['subseries'])):\n",
			
 
				+    "        test = allCs2['subseries'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "*Rinse & Repeat* con i livelli *series*, *subfonds* e *fonds*"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 15,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.14042901992797852\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "seriesKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['series'])):\n",
			
 
				+    "    test = allCs2['series'][ii]\n",
			
 
				+    "    seriesKeys = seriesKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "seriesHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in seriesKeys):\n",
			
 
				+    "        seriesHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_series.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(seriesHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(seriesHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['series'])):\n",
			
 
				+    "        test = allCs2['series'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 16,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.004575967788696289\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "subfondsKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['subfonds'])):\n",
			
 
				+    "    test = allCs2['subfonds'][ii]\n",
			
 
				+    "    subfondsKeys = subfondsKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "subfondsHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in subfondsKeys):\n",
			
 
				+    "        subfondsHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_subfonds.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(subfondsHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(subfondsHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['subfonds'])):\n",
			
 
				+    "        test = allCs2['subfonds'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 17,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.006090879440307617\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "fondsKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['fonds'])):\n",
			
 
				+    "    test = allCs2['fonds'][ii]\n",
			
 
				+    "    fondsKeys = fondsKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "fondsHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in fondsKeys):\n",
			
 
				+    "        fondsHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_fonds.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(fondsHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(fondsHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['fonds'])):\n",
			
 
				+    "        test = allCs2['fonds'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 18,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.001628875732421875\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "collectionKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['collection'])):\n",
			
 
				+    "    test = allCs2['collection'][ii]\n",
			
 
				+    "    collectionKeys = collectionKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "collectionHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in collectionKeys):\n",
			
 
				+    "        collectionHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_collection.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(collectionHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(collectionHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['collection'])):\n",
			
 
				+    "        test = allCs2['collection'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 19,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.003462076187133789\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "subSeriesKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['recordgrp'])):\n",
			
 
				+    "    test = allCs2['recordgrp'][ii]\n",
			
 
				+    "    subSeriesKeys = subSeriesKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "subSeriesHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in subSeriesKeys):\n",
			
 
				+    "        subSeriesHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_recordgrp.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(subSeriesHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(subSeriesHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['recordgrp'])):\n",
			
 
				+    "        test = allCs2['recordgrp'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 20,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "name": "stdout",
			
 
				+     "output_type": "stream",
			
 
				+     "text": [
			
 
				+      "Tempo trascorso: 0.1351768970489502\n"
			
 
				+     ]
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "ts1 = datetime.timestamp(datetime.now())\n",
			
 
				+    "\n",
			
 
				+    "otherlevelKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['otherlevel'])):\n",
			
 
				+    "    test = allCs2['otherlevel'][ii]\n",
			
 
				+    "    otherlevelKeys = otherlevelKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "otherlevelHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in otherlevelKeys):\n",
			
 
				+    "        otherlevelHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "with open(export_dir + \"data_otherlevel.csv\", \"w\", newline=\"\") as csv_file:\n",
			
 
				+    "    writer = csv.DictWriter(csv_file, fieldnames=list(otherlevelHeader.keys()))\n",
			
 
				+    "    writer.writeheader()\n",
			
 
				+    "    writer.writerow(otherlevelHeader)\n",
			
 
				+    "    for ii in range(len(allCs2['otherlevel'])):\n",
			
 
				+    "        test = allCs2['otherlevel'][ii]\n",
			
 
				+    "        writer.writerow(traduttoreItem(test))\n",
			
 
				+    "\n",
			
 
				+    "print('Tempo trascorso:', datetime.timestamp(datetime.now()) - ts1)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 21,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "{'altro_livello',\n",
			
 
				+       " 'audience',\n",
			
 
				+       " 'conservazione',\n",
			
 
				+       " 'data_periodo',\n",
			
 
				+       " 'dimensione_altezza_larghezza_spessore',\n",
			
 
				+       " 'genere',\n",
			
 
				+       " 'id',\n",
			
 
				+       " 'id_file',\n",
			
 
				+       " 'id_fonds',\n",
			
 
				+       " 'id_recordgrp',\n",
			
 
				+       " 'id_series',\n",
			
 
				+       " 'id_subfonds',\n",
			
 
				+       " 'id_subgrp',\n",
			
 
				+       " 'id_subseries',\n",
			
 
				+       " 'nota',\n",
			
 
				+       " 'numero',\n",
			
 
				+       " 'persona',\n",
			
 
				+       " 'scope-content_body',\n",
			
 
				+       " 'segnatura_attuale',\n",
			
 
				+       " 'segnatura_precedente',\n",
			
 
				+       " 'supporto',\n",
			
 
				+       " 'titolo_aspo'}"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 21,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "otherlevelKeys = set()\n",
			
 
				+    "for ii in range(len(allCs2['otherlevel'])):\n",
			
 
				+    "    test = allCs2['otherlevel'][ii]\n",
			
 
				+    "    otherlevelKeys = otherlevelKeys.union( traduttoreItem(test).keys() )\n",
			
 
				+    "\n",
			
 
				+    "otherlevelKeys"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 22,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "odict_keys(['id', 'audience', 'altro_livello', 'repository', 'tipologia', 'segnatura_attuale', 'segnatura_precedente', 'titolo_aspo', 'scope-content_head', 'scope-content_body', 'compagnia', 'soggetto', 'persona', 'data', 'data_periodo', 'supporto', 'numero', 'genere', 'dimensione_altezza_larghezza_spessore', 'conservazione', 'consistenza', 'nota', 'oggetto_digitale', 'id_subfonds', 'id_fonds', 'id_series', 'id_subseries', 'id_recordgrp', 'id_otherlevel', 'id_collection', 'id_subgrp', 'id_file'])"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 22,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "otherlevelHeader = OrderedDict()\n",
			
 
				+    "for key in itemHeader:\n",
			
 
				+    "    if(key in otherlevelKeys):\n",
			
 
				+    "        otherlevelHeader[key] = itemHeader[key]\n",
			
 
				+    "\n",
			
 
				+    "otherlevelHeader\n",
			
 
				+    "\n",
			
 
				+    "itemHeader.keys()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 23,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "OrderedDict([('id', '<c level=\"X\" id=#>'),\n",
			
 
				+       "             ('audience', '<c level=\"#\" audience=#>'),\n",
			
 
				+       "             ('altro_livello', '<c otherlevel=#>'),\n",
			
 
				+       "             ('segnatura_attuale', '<num type=\"nuovo ordinamento\">#'),\n",
			
 
				+       "             ('segnatura_precedente', '<odd>#'),\n",
			
 
				+       "             ('titolo_aspo', '<unittitle>#'),\n",
			
 
				+       "             ('scope-content_body', '<scopecontent><p>#'),\n",
			
 
				+       "             ('persona', '<persname authfilenumber=#>#'),\n",
			
 
				+       "             ('data_periodo', '<unitdate>#'),\n",
			
 
				+       "             ('supporto', '<physfacet type=\"supporto\">#'),\n",
			
 
				+       "             ('numero', '<extent>#'),\n",
			
 
				+       "             ('genere', '<genreform>#'),\n",
			
 
				+       "             ('dimensione_altezza_larghezza_spessore', '<dimensions>#'),\n",
			
 
				+       "             ('conservazione', '<phystech>#'),\n",
			
 
				+       "             ('nota', '<note>#'),\n",
			
 
				+       "             ('id_subfonds', '<c level=\"subfonds\" id=#>'),\n",
			
 
				+       "             ('id_fonds', '<c level=\"fonds\" id=#>'),\n",
			
 
				+       "             ('id_series', '<c level=\"series\" id=#>'),\n",
			
 
				+       "             ('id_subseries', '<c level=\"subseries\" id=#>'),\n",
			
 
				+       "             ('id_recordgrp', '<c level=\"recordgrp\" id=#>'),\n",
			
 
				+       "             ('id_subgrp', '<c level=\"subgrp\" id=#>'),\n",
			
 
				+       "             ('id_file', '<c level=\"file\" id=#>')])"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 23,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "otherlevelHeader"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": 24,
			
 
				+   "metadata": {},
			
 
				+   "outputs": [
			
 
				+    {
			
 
				+     "data": {
			
 
				+      "text/plain": [
			
 
				+       "OrderedDict([('id', '<c level=\"X\" id=#>'),\n",
			
 
				+       "             ('audience', '<c level=\"#\" audience=#>'),\n",
			
 
				+       "             ('altro_livello', '<c otherlevel=#>'),\n",
			
 
				+       "             ('repository', '<repository>#'),\n",
			
 
				+       "             ('tipologia', '<materialspec label=\"tipologia\">#'),\n",
			
 
				+       "             ('segnatura_attuale', '<num type=\"nuovo ordinamento\">#'),\n",
			
 
				+       "             ('segnatura_precedente', '<odd>#'),\n",
			
 
				+       "             ('titolo_aspo', '<unittitle>#'),\n",
			
 
				+       "             ('scope-content_head', '<scopecontent><head>#'),\n",
			
 
				+       "             ('scope-content_body', '<scopecontent><p>#'),\n",
			
 
				+       "             ('compagnia', '<corpname>#'),\n",
			
 
				+       "             ('soggetto', '<subject>#'),\n",
			
 
				+       "             ('persona', '<persname authfilenumber=#>#'),\n",
			
 
				+       "             ('data', '<date>#'),\n",
			
 
				+       "             ('data_periodo', '<unitdate>#'),\n",
			
 
				+       "             ('supporto', '<physfacet type=\"supporto\">#'),\n",
			
 
				+       "             ('numero', '<extent>#'),\n",
			
 
				+       "             ('genere', '<genreform>#'),\n",
			
 
				+       "             ('dimensione_altezza_larghezza_spessore', '<dimensions>#'),\n",
			
 
				+       "             ('conservazione', '<phystech>#'),\n",
			
 
				+       "             ('consistenza', '<extent unit=#1>#2, #1: #2'),\n",
			
 
				+       "             ('nota', '<note>#'),\n",
			
 
				+       "             ('oggetto_digitale', '<daoloc title=#>'),\n",
			
 
				+       "             ('id_subfonds', '<c level=\"subfonds\" id=#>'),\n",
			
 
				+       "             ('id_fonds', '<c level=\"fonds\" id=#>'),\n",
			
 
				+       "             ('id_series', '<c level=\"series\" id=#>'),\n",
			
 
				+       "             ('id_subseries', '<c level=\"subseries\" id=#>'),\n",
			
 
				+       "             ('id_recordgrp', '<c level=\"recordgrp\" id=#>'),\n",
			
 
				+       "             ('id_otherlevel',\n",
			
 
				+       "              '<c level=\"otherlevel\" id=# otherlevel=\"subfile\">'),\n",
			
 
				+       "             ('id_collection', '<c level=\"collection\" id=#>'),\n",
			
 
				+       "             ('id_subgrp', '<c level=\"subgrp\" id=#>'),\n",
			
 
				+       "             ('id_file', '<c level=\"file\" id=#>')])"
			
 
				+      ]
			
 
				+     },
			
 
				+     "execution_count": 24,
			
 
				+     "metadata": {},
			
 
				+     "output_type": "execute_result"
			
 
				+    }
			
 
				+   ],
			
 
				+   "source": [
			
 
				+    "itemHeader"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "interpreter": {
			
 
				+   "hash": "36cf16204b8548560b1c020c4e8fb5b57f0e4c58016f52f2d4be01e192833930"
			
 
				+  },
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3.7.3 64-bit",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.9.5"
			
 
				+  },
			
 
				+  "metadata": {
			
 
				+   "interpreter": {
			
 
				+    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
			
 
				+   }
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 2
			
 
				+}