
Upload file to 'CSV_to_RDF'

Alessia committed 2 years ago
commit 29d7afcd85
1 changed file with 222 additions and 0 deletions

+ 222 - 0
CSV_to_RDF/CSV_to_RDF_ASPOPeople.py

@@ -0,0 +1,222 @@
+# Utilities to read/write csv files
+import csv
+import re
+# Utilities to handle character encodings
+import unicodedata
+# Ordered Dicts
+from collections import OrderedDict
+
+
+
+# OPTIONAL IMPORTS
+
+# For timestamping/simple speed tests
+from datetime import datetime
+# Random number generator
+from random import *
+# System & command line utilities
+import sys
+# Json for the dictionary
+import json
+
+import_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
+export_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
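+# Note: the script reads Onomastica_Datini.csv from import_dir and writes Onomastica_Datini.ttl
+# to export_dir; both point to a local path and will need adjusting on another machine.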
+
+
+# Custom class to store URIs + related infos for the ontologies/repositories
+
+class RDFcoords:
+    def __init__(self, uri, prefix, code=None):
+        self.uri = uri
+        self.prefix = prefix
+        self.code = code
+
+
+# Repositories
+aspoCoords = RDFcoords('<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/>', 'aspo:')
+foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
+cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
+schemaCoords = RDFcoords('<http://schema.org/>', 'schema:')
+personCoords = RDFcoords('<http://www.w3.org/ns/person#>', 'person:')
+nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
+rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
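+
+# The prefix of each coords object is used to build prefixed names in the triples below
+# (e.g. foafCoords.prefix + 'name' gives foaf:name); the corresponding uri is emitted in the
+# @prefix declarations written by writeTTLHeader().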
+
+
+# Basic functions for triples / shortened triples in TTL format
+
+def triple(subject, predicate, object1):
+    line = subject + ' ' + predicate + ' ' + object1
+    return line
+
+def doublet(predicate, object1):
+    line = '    ' + predicate + ' ' + object1
+    return line
+
+def singlet(object1):
+    line = '        ' + object1
+    return line
+
+# Line endings in TTL format
+continueLine1 = ' ;\n'
+continueLine2 = ' ,\n'
+closeLine = ' .\n'
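+
+# Illustrative example: triple('aspo:123', 'rdf:type', 'foaf:Person') + closeLine
+# yields the TTL statement "aspo:123 rdf:type foaf:Person ." followed by a newline
+# ('aspo:123' is a made-up identifier, not a real record).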
+
+def writeTTLHeader(output):
+    output.write('@prefix ' + aspoCoords.prefix + ' ' + aspoCoords.uri + closeLine)
+    output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
+    output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
+    output.write('@prefix ' + personCoords.prefix + ' ' + personCoords.uri + closeLine)
+    output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
+    output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
+    output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
+
+    output.write('\n')
+
+
+filePrefix = 'Onomastica_'
+fileType = 'Datini'
+max_entries = 1000000000
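+# max_entries caps how many CSV rows are processed (checked at the end of the loop);
+# lower it for quick test runs.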
+
+with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file, open(
+        export_dir + filePrefix + fileType + '.ttl', 'w') as output:
+    reader = csv.DictReader(csv_file)
+    writeTTLHeader(output)
+    first = True
+    ii = 0
+    for row in reader:
+        # The index ii is used to process a limited number of entries for testing purposes
+        ii = ii + 1
+        if row['entityType'] == 'person':
+
+            id_aspo = row['recordId']
+
+            # placeholder: prefixed name (aspo:<recordId>) for this person record
+            aspoPlaceHolder = aspoCoords.prefix + id_aspo
+
+            line = triple(aspoPlaceHolder,
+                          nsCoords.prefix + 'type',
+                          foafCoords.prefix + 'Person') + closeLine
+            output.write(line)
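+            # Note: literal objects are written verbatim between double quotes, so values are
+            # assumed not to contain unescaped '"' characters.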
+            line = triple(aspoPlaceHolder,
+                          foafCoords.prefix + 'name',
+                          '\"' + row['nameEntry@normal'] + '\"') + closeLine
+            output.write(line)
+
+            if row['nome proprio'] != '':
+                name = row['nome proprio'].replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'givenName',
+                              '\"' + name + '\"') + closeLine
+                output.write(line)
+
+            if row['nome di famiglia'] != '':
+                familyName = row['nome di famiglia'].replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'familyName',
+                              '\"' + familyName + '\"') + closeLine
+                output.write(line)
+
+            if row['Alias'] != '':
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'alternateName',
+                              '\"' + row['Alias'] + '\"') + closeLine
+                output.write(line)
+
+            if row['genere'] != '':
+                line = triple(aspoPlaceHolder,
+                              foafCoords.prefix + 'gender',
+                              '\"' + row['genere'] + '\"') + closeLine
+                output.write(line)
+
+            if row['patronimico/matronimico'] != '':
+                pp = row['patronimico/matronimico']
+                patronymic = pp.replace("\n", "")
+                line = triple(aspoPlaceHolder,
+                              personCoords.prefix + 'patronymicName',
+                              '\"' + patronymic + '\"') + closeLine
+                output.write(line)
+
+            if row['occupation'] != '' and row['occupation'] != '\n':
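+                # keep letters only so the occupation value can double as a local name in the
+                # aspo: namespace (it is used below both as object and as subject of new triples)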
+                occ = row['occupation']
+                occupation = re.sub(r'[^A-Za-z]', '', occ)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'hasOccupation',
+                              aspoCoords.prefix + occupation) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + occupation,
+                              nsCoords.prefix + 'type',
+                              schemaCoords.prefix + 'Occupation') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + occupation,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + row['occupation'] + '\"') + closeLine
+                output.write(line)
+
+            if row['avo 1'] != '':
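+                # 'avo' columns hold an ancestor's name, usually introduced by the particle 'di '
+                # ('of' in Italian); the particle is removed and the remaining letters are used as
+                # a local name for the related person resource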
+                avo1 = row['avo 1'].replace('di ', '')
+                avo1card = re.sub(r'[^A-Za-z]', '', avo1)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'relatedTo',
+                              aspoCoords.prefix + avo1card) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo1card,
+                              nsCoords.prefix + 'type',
+                              foafCoords.prefix + 'Person') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo1card,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + avo1 + '\"') + closeLine
+                output.write(line)
+
+            if row['avo 2'] != '':
+                avo2 = row['avo 2'].replace('di ', '')
+                avo2card = re.sub(r'[^A-Za-z]', '', avo2)
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'relatedTo',
+                              aspoCoords.prefix + avo2card) + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo2card,
+                              nsCoords.prefix + 'type',
+                              foafCoords.prefix + 'Person') + closeLine
+                output.write(line)
+                line = triple(aspoCoords.prefix + avo2card,
+                              rdfsCoords.prefix + 'label',
+                              '\"' + avo2 + '\"') + closeLine
+                output.write(line)
+
+            if row['Qualifica'] != '':
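+                # 'Qualifica' may contain several honorifics separated by '|'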
+                qq = row['Qualifica']
+                qualifiche = qq.split('|')
+                for qualifica in qualifiche:
+                    honorific = qualifica.replace("\n", "")
+                    line = triple(aspoPlaceHolder,
+                                  schemaCoords.prefix + 'honorificPrefix',
+                                  '\"' + honorific + '\"') + closeLine
+                    output.write(line)
+
+            if row['place_occupation_Qualifica'] != '':
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'workLocation',
+                              '\"' + row['place_occupation_Qualifica'] + '\"') + closeLine
+                output.write(line)
+
+            if row['biogHist p'] != '':
+                bio = row['biogHist p']
+                biog = bio.replace("\n", " ")
+                line = triple(aspoPlaceHolder,
+                              schemaCoords.prefix + 'description',
+                              '\"' + biog + '\"') + closeLine
+                output.write(line)
+
+
+        output.write('\n')
+        # Limit number of entries processed (if desired)
+        if ii > max_entries:
+            break