"""Export the bibliography columns of a museum CSV file to a Turtle file.

The module defines small helpers for assembling Turtle (TTL) statements and
a ``main`` entry point that reads ``OA_Data_<fileType>.csv`` from
``import_dir`` and writes ``OA_Data_<fileType>_BIB.ttl`` into ``export_dir``.
"""

# Utilities to read/write csv files
import csv
# Utilities to handle character encodings
import unicodedata
# Ordered Dicts
from collections import OrderedDict
from http.cookiejar import CookieJar
from urllib.request import urlopen
#from bs4 import BeautifulSoup
import urllib
# Json for the dictionary
import json
from socket import error as SocketError
import html.parser

# OPTIONAL IMPORTS

# For timestamping/simple speed tests
from datetime import datetime
# Random number generator
# NOTE(review): wildcard import kept because other code may rely on the
# names it injects; prefer explicit imports once usage is confirmed.
from random import *
# System & command line utilities
import sys

import_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/CSV/corretti/'
export_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/RDF/'


# Custom class to store URIs + related infos for the ontologies/repositories
class RDFcoords:
    """Coordinates of one ontology/repository: URI, TTL prefix, optional code."""

    def __init__(self, uri, prefix, code=None):
        self.uri = uri
        self.prefix = prefix
        self.code = code


# Repositories
# NOTE(review): every URI below is the empty string, so writeTTLHeader emits
# lines such as "@prefix mpp:  ." that lack the IRI a Turtle @prefix
# directive requires.  Presumably the "<...>" IRIs were lost at some point;
# restore them before relying on the generated file.
museoCoords = RDFcoords('', 'mpp:')
autCoords = RDFcoords('', 'aut:')
cidocCoords = RDFcoords('', 'crm:')
aatCoords = RDFcoords('', 'aat:')
nsCoords = RDFcoords('', 'rdf:')
schemaCoords = RDFcoords('', 'rdfs:')
xsdCoords = RDFcoords('', 'xsd:')
iconCoords = RDFcoords('', 'ico:')
documentsCoords = RDFcoords('', 'ds:')
rdfsCoords = RDFcoords('', 'rdfs:')


# Basic functions for triples / shortened triples in TTL format
def triple(subject, predicate, object1):
    """Return ``subject predicate object1`` joined by single spaces."""
    line = subject + ' ' + predicate + ' ' + object1
    return line


def doublet(predicate, object1):
    """Return a predicate/object pair with a leading space (no subject)."""
    line = ' ' + predicate + ' ' + object1
    return line


def singlet(object1):
    """Return a lone object with a leading space (no subject, no predicate)."""
    line = ' ' + object1
    return line


# Line endings in TTL format
continueLine1 = ' ;\n'
continueLine2 = ' ,\n'
closeLine = ' .\n'


def writeTTLHeader(output):
    """Write one ``@prefix`` line per repository, then a blank line.

    Args:
        output: any object exposing ``write(str)``, e.g. an open text file.
    """
    # The order (and the repeated 'rdfs:' prefix, declared by both
    # schemaCoords and rdfsCoords) mirrors the original sequence of writes.
    header_coords = (
        museoCoords,
        cidocCoords,
        aatCoords,
        schemaCoords,
        nsCoords,
        autCoords,
        xsdCoords,
        iconCoords,
        documentsCoords,
        rdfsCoords,
    )
    for coords in header_coords:
        output.write('@prefix ' + coords.prefix + ' ' + coords.uri + closeLine)
    output.write('\n')


filePrefix = 'OA_Data_'
fileType = 'Datini'
max_entries = 1000000000


def main():
    """Read the source CSV and write the TTL header to the ``_BIB.ttl`` file.

    Raises:
        OSError: if the CSV cannot be opened or the TTL file cannot be created.
        KeyError: if a processed row lacks the ``URL`` or ``NCTN`` column.
    """
    source_path = import_dir + filePrefix + fileType + '.csv'
    target_path = export_dir + filePrefix + fileType + '_BIB.ttl'

    # NOTE(review): the CSV is still read with the platform default encoding,
    # as before; confirm the real encoding and pass it explicitly.
    # The Turtle output is written as UTF-8, the encoding Turtle mandates.
    with open(source_path, newline="") as csv_file, \
            open(target_path, 'w', encoding='utf-8') as output:
        reader = csv.DictReader(csv_file)
        writeTTLHeader(output)

        # The index ii is used to process a limited number of entries for
        # testing purposes.
        for ii, row in enumerate(reader, start=1):
            # Limit number of entries processed (if desired).  Checked before
            # the row is handled so exactly max_entries rows are processed.
            if ii > max_entries:
                break

            # Currently unused, but the lookups make a CSV without these
            # columns fail immediately.
            # presumably these fed the placeholders of the disabled export
            # below - verify before re-enabling it.
            url = row['URL']
            pp = row['NCTN']

            # ---- Disabled bibliography export -----------------------------
            # The original script carried sixteen commented-out copies of the
            # template below:
            #   * one per column BIBH, BIBH1 ... BIBH14, each reading its
            #     note from the matching BIBN, BIBN1 ... BIBN14 column;
            #   * output.write('\n') after the BIBH14 copy;
            #   * a last copy guarded by row['FTAN'] that still read its note
            #     from row['BIBN'].
            # All three placeholders were empty strings in every copy.
            #
            # if row['BIBH'] != '':
            #     e73placeHolder = ""
            #     e73placeHolderED = ""
            #     e73placeHolderEDPP = ""
            #     line = triple(e73placeHolderEDPP,
            #                   documentsCoords.prefix,
            #                   e73placeHolder) + closeLine
            #     output.write(line)
            #     line = triple(e73placeHolderED,
            #                   cidocCoords.prefix + "P46_is_composed_of",
            #                   e73placeHolderEDPP) + closeLine
            #     output.write(line)
            #     line = triple(e73placeHolderEDPP,
            #                   cidocCoords.prefix + 'P3_has_note',
            #                   '\"' + row['BIBN'] + '\"') + closeLine
            #     output.write(line)
            #     line = triple(e73placeHolderEDPP,
            #                   nsCoords.prefix + 'type',
            #                   cidocCoords.prefix + 'E73_Information_Object') + closeLine
            #     output.write(line)
            #     line = triple(e73placeHolderEDPP,
            #                   rdfsCoords.prefix + 'label',
            #                   '\"Bibliografia specifica\"') + closeLine
            #     output.write(line)


if __name__ == "__main__":
    main()