# Utilities to read/write csv files
import csv
# Utilities to handle character encodings
import unicodedata
# Ordered Dicts
from collections import OrderedDict
# Json for the dictionary
import json

# OPTIONAL IMPORTS
# For timestamping/simple speed tests
from datetime import datetime
# Random number generator
from random import *
# System & command line utilities
import sys
import_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/CSV/corretti/'
export_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/RDF/'
# Custom class to store URIs + related infos for the ontologies/repositories
class RDFcoords:
def __init__(self, uri, prefix, code = None):
self.uri = uri
self.prefix = prefix
self.code = code
# Repositories
museoCoords = RDFcoords('', 'mpp:')
autCoords = RDFcoords('', 'aut:')
foafCoords = RDFcoords('', 'foaf:')
cidocCoords = RDFcoords('', 'crm:')
aatCoords = RDFcoords('', 'aat:')
nsCoords = RDFcoords('', 'rdf:')
schemaCoords = RDFcoords('', 'schema:')
rdfsCoords = RDFcoords('', 'rdfs:')
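# NOTE: the uri fields are left empty in this script; they must be filled in with the
# actual namespace IRIs (angle brackets included) before the @prefix declarations
# written by writeTTLHeader are valid Turtle, e.g. (illustrative value only):
# foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')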
# Basic functions for triples / shortened triples in TTL format
def triple(subject, predicate, object1):
line = subject + ' ' + predicate + ' ' + object1
return line
def doublet(predicate, object1):
line = ' ' + predicate + ' ' + object1
return line
def singlet(object1):
line = ' ' + object1
return line
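# Example (illustrative values): triple('aut:123', 'rdf:type', 'crm:E21_Person')
# returns 'aut:123 rdf:type crm:E21_Person'; appending one of the line endings
# defined below (e.g. closeLine) turns it into a complete TTL statement.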
# Line endings in TTL format
continueLine1 = ' ;\n'
continueLine2 = ' ,\n'
closeLine = ' .\n'
def writeTTLHeader(output):
output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
output.write('\n')
filePrefix = 'AR20AUT_'
fileType = 'Datini'
max_entries = 1000000000
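# max_entries caps how many CSV rows are processed (see the break at the end of the
# main loop); the value above is effectively "no limit".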
with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file, open(
export_dir + filePrefix + fileType + '.ttl', 'w') as output:
reader = csv.DictReader(csv_file)
writeTTLHeader(output)
first = True
ii = 0
for row in reader:
# The index ii is used to process a limited number of entries for testing purposes
ii = ii + 1
url = row['URL']
#placeHolders
e21placeHolder = autCoords.prefix + url
e62placeHolder = autCoords.prefix + url + '_E62'
e41placeHolder = autCoords.prefix + url + '_E41'
e42placeHolder = autCoords.prefix + row['AUTH']
e67placeHolder = autCoords.prefix + url + '_E67'
e69placeHolder = autCoords.prefix + url + '_E69'
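        # Each authority record is mapped to a small CIDOC-CRM cluster minted from the
        # record URL (or the AUTH code for the E42 identifier): E21 Person, E62 String
        # note, E41 Appellation, E42 Identifier, E67 Birth, E69 Death.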
line = triple(e21placeHolder, nsCoords.prefix + 'type',
cidocCoords.prefix + 'E21_Person') + closeLine
output.write(line)
        line = triple(e21placeHolder,
                      nsCoords.prefix + 'type',
                      foafCoords.prefix + 'Person') + closeLine
output.write(line)
if row['AUTN'] != '':
line = triple(e21placeHolder,
foafCoords.prefix + 'name',
'\"' + row['AUTN'] + '\"') + closeLine
output.write(line)
if row['AUTC'] != '':
line = triple(e21placeHolder,
foafCoords.prefix + 'familyName',
'\"' + row['AUTC'] + '\"') + closeLine
output.write(line)
if row['AUTO'] != '':
line = triple(e21placeHolder,
foafCoords.prefix + 'givenName',
'\"' + row['AUTO'] + '\"') + closeLine
output.write(line)
if row['AUTZ'] != '':
line = triple(e21placeHolder,
foafCoords.prefix + 'gender',
'\"' + row['AUTZ'] + '\"') + closeLine
output.write(line)
line = triple(e21placeHolder, rdfsCoords.prefix + 'label',
'\"' + row['AUTN'] + ', ' + row['AUTA'] + '\"') + closeLine
output.write(line)
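        #E21 - P3 - E62: attach a fixed note recording the data source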
line = triple(e21placeHolder, cidocCoords.prefix + 'P3_has_note',
e62placeHolder) + closeLine
output.write(line)
line = triple(e62placeHolder, nsCoords.prefix + 'type',
cidocCoords.prefix + 'E62_String') + closeLine
output.write(line)
line = triple(e62placeHolder, rdfsCoords.prefix + 'label',
'\"Fonte: Museo di Palazzo Pretorio - Collezione Martini\"') + closeLine
output.write(line)
#E21 - P1 - E42
line = triple(e21placeHolder, cidocCoords.prefix + 'P1_is_identified_by',
e42placeHolder) + closeLine
output.write(line)
line = triple(e42placeHolder, nsCoords.prefix + 'type',
cidocCoords.prefix + 'E42_Identifier') + closeLine
output.write(line)
line = triple(e42placeHolder, rdfsCoords.prefix + 'label',
'\"' + row['AUTH'] + '\"') + closeLine
output.write(line)
#E21 - P1 - E41
'''line = triple(e21placeHolder, cidocCoords.prefix + 'P1_is_identified_by',
e41placeHolder) + closeLine
output.write(line)
line = triple(e41placeHolder, nsCoords.prefix + 'type',
cidocCoords.prefix + 'E41_Appellation') + closeLine
output.write(line)
line = triple(e41placeHolder, rdfsCoords.prefix + 'label',
'\"' + row['AUTN'] + '\"') + closeLine
output.write(line)'''
# E21 - P107i - E74
if row['AUTU'] != '':
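            # AUTU may contain several group names separated by '/';
            # one E74 Group node is created for each.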
group = []
if '/' in row['AUTU']:
group = row['AUTU'].split('/')
else:
group.append(row['AUTU'])
for gr in group:
gg = gr.replace(' ', '')
e74placeHolder = museoCoords.prefix + gg
line = triple(e21placeHolder,
cidocCoords.prefix + 'P107i_is_current_or_former_member_of',
e74placeHolder) + closeLine
output.write(line)
line = triple(e74placeHolder,
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E74_Group') + closeLine
output.write(line)
                line = triple(e74placeHolder,
                              rdfsCoords.prefix + 'label',
                              '\"' + gr.strip() + '\"') + closeLine
output.write(line)
#E21 - P98i - E67
line = triple(e21placeHolder,
cidocCoords.prefix + 'P98i_was_born',
e67placeHolder) + closeLine
output.write(line)
line = triple(e67placeHolder,
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E67_Birth') + closeLine
output.write(line)
line = triple(e67placeHolder,
rdfsCoords.prefix + 'label',
'\"Nascita di ' + row['AUTN'] + '\"') + closeLine
output.write(line)
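        #E21 - P100i - E69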
line = triple(e21placeHolder,
cidocCoords.prefix + 'P100i_died_in',
e69placeHolder) + closeLine
output.write(line)
line = triple(e69placeHolder,
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E69_Death') + closeLine
output.write(line)
line = triple(e69placeHolder,
rdfsCoords.prefix + 'label',
'\"Morte di ' + row['AUTN'] + '\"') + closeLine
output.write(line)
#E67 - P7 - E53
if row['AUTL'] != '':
line = triple(e67placeHolder,
cidocCoords.prefix + 'P7_took_place_at',
museoCoords.prefix + row['AUTL']) + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTL'],
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E53_Place') + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTL'],
rdfsCoords.prefix + 'label',
'\"' + row['AUTL'] + '\"') + closeLine
output.write(line)
# E67 - P4 - E52
if row['AUTD'] != '':
tt = row['AUTD'].replace(' ', '')
tim = tt.replace('/', '')
time = tim.replace('.', '')
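            # AUTD stripped of spaces, slashes and dots becomes the local name of the
            # E52 Time-Span node; the original AUTD string is kept as its label.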
line = triple(e67placeHolder,
cidocCoords.prefix + 'P4_has_time-span',
museoCoords.prefix + time) + closeLine
output.write(line)
line = triple(museoCoords.prefix + time,
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E52_Time-Span') + closeLine
output.write(line)
line = triple(museoCoords.prefix + time,
rdfsCoords.prefix + 'label',
'\"' + row['AUTD'] + '\"') + closeLine
output.write(line)
# E69 - P7 - E53
if row['AUTX'] != '':
line = triple(e69placeHolder,
cidocCoords.prefix + 'P7_took_place_at',
museoCoords.prefix + row['AUTX']) + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTX'],
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E53_Place') + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTX'],
rdfsCoords.prefix + 'label',
'\"' + row['AUTX'] + '\"') + closeLine
output.write(line)
# E69 - P4 - E52
if row['AUTT'] != '':
tt = row['AUTT'].replace(' ', '')
tim = tt.replace('/', '')
time = tim.replace('.', '')
line = triple(e69placeHolder,
cidocCoords.prefix + 'P4_has_time-span',
museoCoords.prefix + time) + closeLine
output.write(line)
line = triple(museoCoords.prefix + time,
nsCoords.prefix + 'type',
cidocCoords.prefix + 'E52_Time-Span') + closeLine
output.write(line)
line = triple(museoCoords.prefix + time,
rdfsCoords.prefix + 'label',
'\"' + row['AUTT'] + '\"') + closeLine
output.write(line)
# E21 - occupation
if row['AUTQ'] != '':
line = triple(e21placeHolder,
schemaCoords.prefix + 'hasOccupation',
museoCoords.prefix + row['AUTQ']) + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTQ'],
nsCoords.prefix + 'type',
schemaCoords.prefix + 'Occupation') + closeLine
output.write(line)
line = triple(museoCoords.prefix + row['AUTQ'],
rdfsCoords.prefix + 'label',
'\"' + row['AUTQ'] + '\"') + closeLine
output.write(line)
# E21 - P139 - E41
if row['AUTV'] != '':
autv = []
if '/' in row['AUTV']:
autv = row['AUTV'].split('/')
else:
autv.append(row['AUTV'])
            # AUTV may list several alternative forms separated by '/';
            # emit one P139 statement per form.
            for alt in autv:
                line = triple(e21placeHolder,
                              cidocCoords.prefix + 'P139_has_alternative_form',
                              '\"' + alt.strip() + '\"') + closeLine
                output.write(line)
# E21 - P139 - E41
# if row['AUTP'] != '':
# autv = []
# if '/' in row['AUTP']:
# autv = row['AUTP'].split('/')
# else:
# autv.append(row['AUTP'])
# line = triple(e21placeHolder,
# schemaCoords.prefix + 'alternateName',
# '\"' + row['AUTP'] + '\"') + closeLine
#output.write(line)
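        # Blank line between records in the generated TTL file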
output.write('\n')
#
#
# Limit number of entries processed (if desired)
if (ii > max_entries):
break