## IMPORTS
# Utilities to read/write csv files
import csv
# Directories
import_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/CSV/corretti/'
export_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/RDF/'
# Custom class to store URIs + related info for the ontologies/repositories
class RDFcoords:
    def __init__(self, uri, prefix, code=None):
        self.uri = uri
        self.prefix = prefix
        self.code = code
# Repositories
# NOTE: the project-specific namespaces (mpp:, aut:) have no base IRI set here;
# they must be filled in before the generated Turtle can be parsed.
museoCoords = RDFcoords('', 'mpp:')
autCoords = RDFcoords('', 'aut:')
foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
schemaCoords = RDFcoords('<https://schema.org/>', 'schema:')
rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
# Basic utilities to format triples / shortened triples in TTL format
#
# Format full triple
def triple(subject, predicate, object1):
    line = subject + ' ' + predicate + ' ' + object1
    return line

# Format entry in predicate list (no subject)
def doublet(predicate, object1):
    line = ' ' + predicate + ' ' + object1
    return line

# Format entry in object list (object only)
def singlet(object1):
    line = ' ' + object1
    return line
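# Illustrative example of the helpers above (the subject/predicate/object values
# are assumptions, picked only to show the produced string):
#   triple('aut:123', 'rdf:type', 'crm:E21_Person') + closeLine
#   -> 'aut:123 rdf:type crm:E21_Person .\n'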
# Line endings
continueLine1 = ' ;\n'  # Before a predicate list, i.e. when the FOLLOWING triple has the same subject
continueLine2 = ' ,\n'  # Before an object list, i.e. when the FOLLOWING triple has the same subject and predicate
closeLine = ' .\n'  # To end a triple / a block of triples
def writeTTLHeader(output):
    output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
    output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
    output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
    output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
    output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
    output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
    output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
    output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
    output.write('\n')
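# Each call above emits one Turtle prefix declaration; for example, with foaf
# defined as above, the emitted line is: '@prefix foaf: <http://xmlns.com/foaf/0.1/> .\n'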
filePrefix = 'AR20AUT_'
fileType = 'Datini'
max_entries = None
def parsefromfile(infile, outfile):
    # infile is expected to be the raw bytes of a CSV file (e.g. an uploaded file);
    # decode it to text before handing it to csv.DictReader
    pyppa = infile.decode()
    csv_dicts = [{k: v for k, v in row.items()} for row in csv.DictReader(pyppa.splitlines(), skipinitialspace=True)]
    parse(csv_dicts, outfile)
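# Overview of the CSV columns handled by parse() below (as used in the code):
#   URL            -> identifier used to mint the aut: URIs of the person and related nodes
#   AUTH           -> E42_Identifier
#   AUTN/AUTC/AUTO -> foaf:name / foaf:familyName / foaf:givenName
#   AUTZ           -> foaf:gender
#   AUTA           -> appended to AUTN in the rdfs:label of the person
#   AUTU           -> E74_Group membership (multiple groups separated by '/')
#   AUTL/AUTD      -> birth place (E53_Place) / birth date (E52_Time-Span)
#   AUTX/AUTT      -> death place (E53_Place) / death date (E52_Time-Span)
#   AUTQ           -> schema:Occupation
#   AUTV           -> alternative name forms (P139)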
def parse(dict_list, outfile):
    with open(outfile, 'w') as output:
        writeTTLHeader(output)
        first = True  # In case something needs processing only once for the whole CSV input
        for ii, row in enumerate(dict_list):
            # The index ii is mainly used to limit the number of entries to process, for testing purposes
            url = row['URL']
            # Placeholders for the URIs of the person and of its related nodes
            e21placeHolder = autCoords.prefix + url
            e62placeHolder = autCoords.prefix + url + '_E62'
            e42placeHolder = autCoords.prefix + row['AUTH']
            e67placeHolder = autCoords.prefix + url + '_E67'
            e69placeHolder = autCoords.prefix + url + '_E69'
            # rdf:type triples: the author is both a crm:E21_Person and a foaf:Person
            line = triple(e21placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E21_Person') + closeLine
            output.write(line)
            line = triple(e21placeHolder,
                          nsCoords.prefix + 'type',
                          foafCoords.prefix + 'Person') + closeLine
            output.write(line)
            if row['AUTN'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'name',
                              '\"' + row['AUTN'] + '\"') + closeLine
                output.write(line)
            if row['AUTC'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'familyName',
                              '\"' + row['AUTC'] + '\"') + closeLine
                output.write(line)
            if row['AUTO'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'givenName',
                              '\"' + row['AUTO'] + '\"') + closeLine
                output.write(line)
            if row['AUTZ'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'gender',
                              '\"' + row['AUTZ'] + '\"') + closeLine
                output.write(line)
            line = triple(e21placeHolder, rdfsCoords.prefix + 'label',
                          '\"' + row['AUTN'] + ', ' + row['AUTA'] + '\"') + closeLine
            output.write(line)
            line = triple(e21placeHolder, cidocCoords.prefix + 'P3_has_note',
                          e62placeHolder) + closeLine
            output.write(line)
            line = triple(e62placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E62_String') + closeLine
            output.write(line)
            line = triple(e62placeHolder, rdfsCoords.prefix + 'label',
                          '\"Fonte: Museo di Palazzo Pretorio - Collezione Martini\"') + closeLine
            output.write(line)
            # E21 - P1 - E42
            line = triple(e21placeHolder, cidocCoords.prefix + 'P1_is_identified_by',
                          e42placeHolder) + closeLine
            output.write(line)
            line = triple(e42placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E42_Identifier') + closeLine
            output.write(line)
            line = triple(e42placeHolder, rdfsCoords.prefix + 'label',
                          '\"' + row['AUTH'] + '\"') + closeLine
            output.write(line)
            # E21 - P107i - E74
            if row['AUTU'] != '':
                # each '/'-separated value in AUTU is a separate group
                group = []
                if '/' in row['AUTU']:
                    group = row['AUTU'].split('/')
                else:
                    group.append(row['AUTU'])
                for gr in group:
                    gg = gr.replace(' ', '')
                    e74placeHolder = museoCoords.prefix + gg
                    line = triple(e21placeHolder,
                                  cidocCoords.prefix + 'P107i_is_current_or_former_member_of',
                                  e74placeHolder) + closeLine
                    output.write(line)
                    line = triple(e74placeHolder,
                                  nsCoords.prefix + 'type',
                                  cidocCoords.prefix + 'E74_Group') + closeLine
                    output.write(line)
                    # label each group with its own name rather than the whole AUTU field
                    line = triple(e74placeHolder,
                                  rdfsCoords.prefix + 'label',
                                  '\"' + gr.strip() + '\"') + closeLine
                    output.write(line)
            # E21 - P98i - E67
            line = triple(e21placeHolder,
                          cidocCoords.prefix + 'P98i_was_born',
                          e67placeHolder) + closeLine
            output.write(line)
            line = triple(e67placeHolder,
                          nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E67_Birth') + closeLine
            output.write(line)
            line = triple(e67placeHolder,
                          rdfsCoords.prefix + 'label',
                          '\"Nascita di ' + row['AUTN'] + '\"') + closeLine
            output.write(line)
            # E21 - P100i - E69
            line = triple(e21placeHolder,
                          cidocCoords.prefix + 'P100i_died_in',
                          e69placeHolder) + closeLine
            output.write(line)
            line = triple(e69placeHolder,
                          nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E69_Death') + closeLine
            output.write(line)
            line = triple(e69placeHolder,
                          rdfsCoords.prefix + 'label',
                          '\"Morte di ' + row['AUTN'] + '\"') + closeLine
            output.write(line)
            # E67 - P7 - E53
            if row['AUTL'] != '':
                line = triple(e67placeHolder,
                              cidocCoords.prefix + 'P7_took_place_at',
                              museoCoords.prefix + row['AUTL']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTL'],
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E53_Place') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTL'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTL'] + '\"') + closeLine
                output.write(line)
            # E67 - P4 - E52
            if row['AUTD'] != '':
                # normalise the date string (strip spaces, slashes, dots) to mint a URI-safe Time-Span id
                tt = row['AUTD'].replace(' ', '')
                tim = tt.replace('/', '')
                time = tim.replace('.', '')
                line = triple(e67placeHolder,
                              cidocCoords.prefix + 'P4_has_time-span',
                              museoCoords.prefix + time) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E52_Time-Span') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTD'] + '\"') + closeLine
                output.write(line)
            # E69 - P7 - E53
            if row['AUTX'] != '':
                line = triple(e69placeHolder,
                              cidocCoords.prefix + 'P7_took_place_at',
                              museoCoords.prefix + row['AUTX']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTX'],
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E53_Place') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTX'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTX'] + '\"') + closeLine
                output.write(line)
            # E69 - P4 - E52
            if row['AUTT'] != '':
                tt = row['AUTT'].replace(' ', '')
                tim = tt.replace('/', '')
                time = tim.replace('.', '')
                line = triple(e69placeHolder,
                              cidocCoords.prefix + 'P4_has_time-span',
                              museoCoords.prefix + time) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E52_Time-Span') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTT'] + '\"') + closeLine
                output.write(line)
            # E21 - occupation
            if row['AUTQ'] != '':
                line = triple(e21placeHolder,
                              schemaCoords.prefix + 'hasOccupation',
                              museoCoords.prefix + row['AUTQ']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTQ'],
                              nsCoords.prefix + 'type',
                              schemaCoords.prefix + 'Occupation') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTQ'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTQ'] + '\"') + closeLine
                output.write(line)
            # E21 - P139 - E41
            if row['AUTV'] != '':
                # AUTV may hold several alternative name forms separated by '/'
                autv = []
                if '/' in row['AUTV']:
                    autv = row['AUTV'].split('/')
                else:
                    autv.append(row['AUTV'])
                for av in autv:
                    line = triple(e21placeHolder,
                                  cidocCoords.prefix + 'P139_has_alternative_form',
                                  '\"' + av.strip() + '\"') + closeLine
                    output.write(line)
            # Blank line between the triple blocks of different authors
            output.write('\n')
            # To limit the number of entries processed (if desired, for testing purposes)
            if max_entries is not None and ii > max_entries:
                break
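# Minimal usage sketch: the input/output file names below are assumptions for
# illustration only; the actual CSV files in import_dir may be named differently.
if __name__ == '__main__':
    csv_path = import_dir + filePrefix + fileType + '.csv'   # assumed input name
    ttl_path = export_dir + filePrefix + fileType + '.ttl'   # assumed output name
    # parsefromfile() expects raw bytes, so the CSV is read in binary mode
    with open(csv_path, 'rb') as source:
        parsefromfile(source.read(), ttl_path)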