
Upload file to 'CSV_to_RDF'

Alessia committed 2 years ago
commit 29d7afcd85
1 changed file with 222 additions and 0 deletions

+ 222 - 0
CSV_to_RDF/CSV_to_RDF_ASPOPeople.py

@@ -0,0 +1,222 @@
+# Utilities to read/write csv files
+import csv
+import re
+# Utilities to handle character encodings
+import unicodedata
+# Ordered Dicts
+from collections import OrderedDict
+
+
+
+# OPTIONAL IMPORTS
+
+# For timestamping/simple speed tests
+from datetime import datetime
+# Random number generator
+from random import *
+# System & command line utilities
+import sys
+# Json for the dictionary
+import json
+
+import_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
+export_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
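+# Note: the script reads Onomastica_Datini.csv from import_dir and writes Onomastica_Datini.ttl
+# to export_dir; both point to a local path and will need adjusting on another machine.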
+
+
+# Custom class to store URIs + related infos for the ontologies/repositories
+
+class RDFcoords:
+    def __init__(self, uri, prefix, code=None):
+        self.uri = uri
+        self.prefix = prefix
+        self.code = code
+
+
+# Repositories
+aspoCoords = RDFcoords('<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/>', 'aspo:')
+foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
+cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
+schemaCoords = RDFcoords('<http://schema.org/>', 'schema:')
+personCoords = RDFcoords('<http://www.w3.org/ns/person#>', 'person:')
+nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
+rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
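+
+# The prefix of each coords object is used to build prefixed names in the triples below
+# (e.g. foafCoords.prefix + 'name' gives foaf:name); the corresponding uri is emitted in the
+# @prefix declarations written by writeTTLHeader().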
+
+
+# Basic functions for triples / shortened triples in TTL format
+
+def triple(subject, predicate, object1):
+    line = subject + ' ' + predicate + ' ' + object1
+    return line
+
+def doublet(predicate, object1):
+    line = '    ' + predicate + ' ' + object1
+    return line
+
+def singlet(object1):
+    line = '        ' + object1
+    return line
+
+# Line endings in TTL format
+continueLine1 = ' ;\n'
+continueLine2 = ' ,\n'
+closeLine = ' .\n'
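+
+# Illustrative example: triple('aspo:123', 'rdf:type', 'foaf:Person') + closeLine
+# yields the TTL statement "aspo:123 rdf:type foaf:Person ." followed by a newline
+# ('aspo:123' is a made-up identifier, not a real record).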
+
+def writeTTLHeader(output):
+    output.write('@prefix ' + aspoCoords.prefix + ' ' + aspoCoords.uri + closeLine)
+    output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
+    output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
+    output.write('@prefix ' + personCoords.prefix + ' ' + personCoords.uri + closeLine)
+    output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
+    output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
+    output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
+
+    output.write('\n')
+
+
+filePrefix = 'Onomastica_'
+fileType = 'Datini'
+max_entries = 1000000000
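+# max_entries caps how many CSV rows are processed (checked at the end of the loop);
+# lower it for quick test runs.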
+
+with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file, open(
+        export_dir + filePrefix + fileType + '.ttl', 'w') as output:
+    reader = csv.DictReader(csv_file)
+    writeTTLHeader(output)
+    first = True
+    ii = 0
+    for row in reader:
+        # The index ii is used to process a limited number of entries for testing purposes
+        ii = ii + 1
+        if row['entityType'] == 'person':
+
+            id_aspo = row['recordId']
+
+            # placeholder: prefixed name (aspo:<recordId>) for this person record
+            aspoPlaceHolder = aspoCoords.prefix + id_aspo
+
+            line = triple(aspoPlaceHolder,
+                          nsCoords.prefix + 'type',
+                          foafCoords.prefix + 'Person') + closeLine
+            output.write(line)
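+            # Note: literal objects are written verbatim between double quotes, so values are
+            # assumed not to contain unescaped '"' characters.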
+            line = triple(aspoPlaceHolder,
+                          foafCoords.prefix + 'name',
+                          '\"' + row['nameEntry@normal'] + '\"') + closeLine
+            output.write(line)
+
+            if row['nome proprio'] != '':
+                name = row['nome proprio'].replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'givenName',
+                              '\"' + name + '\"') + closeLine
+                output.write(line)
+
+            if row['nome di famiglia'] != '':
+                familyName = row['nome di famiglia'].replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'familyName',
+                              '\"' + familyName + '\"') + closeLine
+                output.write(line)
+
+            if row['Alias'] != '':
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'alternateName',
+                              '\"' + row['Alias'] + '\"') + closeLine
+                output.write(line)
+
+            if row['genere'] != '':
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'gender',
+                              '\"' + row['genere'] + '\"') + closeLine
+                output.write(line)
+
+            if row['patronimico/matronimico'] != '':
+                pp = row['patronimico/matronimico']
+                patronymic = pp.replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              personCoords.prefix + 'patronymicName',
+                              '\"' + patronymic + '\"') + closeLine
+                output.write(line)
+
+            if row['occupation'] != '' and row['occupation'] != '\n':
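+                # keep letters only so the occupation value can double as a local name in the
+                # aspo: namespace (it is used below both as object and as subject of new triples)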
+                occ = row['occupation']
+                occupation = re.sub(r'[^A-Za-z]', '', occ)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'hasOccupation',
+                              aspoCoords.prefix + occupation) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + occupation,
+                              nsCoords.prefix + 'type',
+                              schemaCoords.prefix + 'Occupation') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + occupation,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + row['occupation'] + '\"') + closeLine
+                output.write(line)
+
+            if row['avo 1'] != '':
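+                # 'avo' columns hold an ancestor's name, usually introduced by the particle 'di '
+                # ('of' in Italian); the particle is removed and the remaining letters are used as
+                # a local name for the related person resource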
+                avo1 = row['avo 1'].replace('di ', '')
+                avo1card = re.sub(r'[^A-Za-z]', '', avo1)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'relatedTo',
+                              aspoCoords.prefix + avo1card) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo1card,
+                              nsCoords.prefix + 'type',
+                              foafCoords.prefix + 'Person') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo1card,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + avo1 + '\"') + closeLine
+                output.write(line)
+
+            if row['avo 2'] != '':
+                avo2 = row['avo 2'].replace('di ', '')
+                avo2card = re.sub(r'[^A-Za-z]', '', avo2)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'relatedTo',
+                              aspoCoords.prefix + avo2card) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo2card,
+                              nsCoords.prefix + 'type',
+                              foafCoords.prefix + 'Person') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo2card,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + avo2 + '\"') + closeLine
+                output.write(line)
+
+            if row['Qualifica'] != '':
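+                # 'Qualifica' may contain several honorifics separated by '|'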
+                qq = row['Qualifica']
+                qualifiche = qq.split('|')
+                for qualifica in qualifiche:
+                    honorific = qualifica.replace("\n", "")
+                    line = triple(aspoPlaceHolder,
+                                  schemaCoords.prefix + 'honorificPrefix',
+                                  '\"' + honorific + '\"') + closeLine
+                    output.write(line)
+
+            if row['place_occupation_Qualifica'] != '':
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'workLocation',
+                              '\"' + row['place_occupation_Qualifica'] + '\"') + closeLine
+                output.write(line)
+
+            if row['biogHist p'] != '':
+                bio = row['biogHist p']
+                biog = bio.replace("\n", " ")
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'description',
+                              '\"' + biog + '\"') + closeLine
+                output.write(line)
+
+
+        output.write('\n')
+        # Limit number of entries processed (if desired)
+        if ii > max_entries:
+            break