# CSV_to_RDF_mpp.py
  1. # Utilities to read/write csv files
  2. import csv
  3. # Utilities to handle character encodings
  4. import unicodedata
  5. # Ordered Dicts
  6. from collections import OrderedDict
  7. import json
# OPTIONAL IMPORTS
  9. # For timestamping/simple speed tests
  10. from datetime import datetime
  11. # Random number generator
  12. from random import *
  13. # System & command line utilities
  14. import sys
  15. # Json for the dictionary
  16. import json
  17. import_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_MPP/tabelle/Ospedale/mod/'
  18. export_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_MPP/tabelle/Ospedale/mod/'
  19. # Custom class to store URIs + related infos for the ontologies/repositories
  20. class RDFcoords:
  21. def __init__(self, uri, prefix, code=None):
  22. self.uri = uri
  23. self.prefix = prefix
  24. self.code = code
  25. # Repositories
  26. museoCoords = RDFcoords('<http://palazzopretorio.comune.prato.it/it/le-opere/alcuni-capolavori/>', 'mpp:')
  27. cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
  28. aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
  29. nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
  30. schemaCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
  31. # Basic functions for triples / shortened triples in TTL format
  32. def triple(subject, predicate, object1):
  33. line = subject + ' ' + predicate + ' ' + object1
  34. return line
  35. def doublet(predicate, object1):
  36. line = ' ' + predicate + ' ' + object1
  37. return line
  38. def singlet(object1):
  39. line = ' ' + object1
  40. return line
  41. # Line endings in TTL format
  42. continueLine1 = ' ;\n'
  43. continueLine2 = ' ,\n'
  44. closeLine = ' .\n'
  45. def writeTTLHeader(output):
  46. output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
  47. output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
  48. output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
  49. output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
  50. output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
  51. output.write('\n')
  52. filePrefix = 'SR20OA_'
  53. fileType = 'Ospedale'
  54. max_entries = 1000000000
  55. with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file, open(
  56. export_dir + filePrefix + fileType + '.ttl', 'w') as output:
  57. reader = csv.DictReader(csv_file)
  58. writeTTLHeader(output)
  59. first = True
  60. ii = 0
  61. for row in reader:
  62. # The index ii is used to process a limited number of entries for testing purposes
  63. ii = ii + 1
  64. if row['RVEL'] == '' or row['RVEL'] == '0':
  65. sb = ''
  66. subj = ''
  67. pp = row['OGTD'] + ' (' + row['ACC'] + ') '
  68. if row['SGTI'] != '':
  69. sb = pp + row['SGTI']
  70. if row['LDCN'] != '':
  71. subj = sb + ' in ' + row['LDCN']
  72. else:
  73. subj = sb
  74. # Triplify the 'codice' -- should exist for every entry
  75. codice = ''
  76. if (row['NCTR'] != '' and row['NCTN'] != ''):
  77. codice = row['NCTR'] + row['NCTN']
  78. '''if (row['RVEL'] != ' '):
  79. codice = codice + "-" + row['RVEL']'''
  80. url = row['URL']
  81. # placeHolders
  82. datplaceHolder = museoCoords.prefix + url
  83. e42placeHolder = museoCoords.prefix + url + '_E42'
  84. e42CplaceHolder = museoCoords.prefix + url + '_E42_MPP'
  85. e73placeHolder = museoCoords.prefix + url + '_E73'
  86. e55placeHolder = museoCoords.prefix + url + '_E55'
  87. e35placeHolder1 = museoCoords.prefix + url + '_E35'
  88. e53placeHolder = museoCoords.prefix + url + '_E53'
  89. e1placeHolder = museoCoords.prefix + url + '_E1'
  90. e74placeHolder = museoCoords.prefix + url + '_E74'
  91. if (codice != ''):
  92. line = triple(datplaceHolder, cidocCoords.prefix + 'P1_is_identified_by', e42placeHolder) + closeLine
  93. output.write(line)
  94. line = triple(e42placeHolder, nsCoords.prefix + 'type',
  95. cidocCoords.prefix + 'E42_Identifier') + closeLine
  96. output.write(line)
  97. '''line = triple(e42placeHolder, cidocCoords.prefix + 'P2_has_type',
  98. aatCoords.prefix + '300404626') + closeLine
  99. output.write(line)
  100. line = triple(aatCoords.prefix + '300404626', schemaCoords.prefix + 'label',
  101. "identifier") + closeLine
  102. output.write(line)'''
  103. line = triple(e42placeHolder, schemaCoords.prefix + 'label',
  104. '\"Codice univoco del bene: ' + codice + '\"') + closeLine
  105. output.write(line)
  106. '''
  107. # AS
  108. e55placeHolder = "<http://www.museodipalazzopretorio.it/" + codice + '/' + identifierCoords.code + ">"
  109. line = triple(e42placeHolder, hasTypePCoords.prefix, e55placeHolder) + closeLine
  110. output.write(line)
  111. line = triple(e55placeHolder, hasTypeCoords.prefix, typeCoords.prefix) + closeLine
  112. output.write(line)
  113. line = triple(e55placeHolder, labelCoords.prefix, '\"Codice univoco del bene\"') + closeLine
  114. output.write(line)
  115. # Fine AS
  116. '''
  117. # Write E22 Man Made Object & E73 Information Object -- should exist for every entry?
  118. line = triple(datplaceHolder, nsCoords.prefix + 'type',
  119. cidocCoords.prefix + 'E22_Man-Made_Object') + closeLine
  120. output.write(line)
  121. # Added by AS
  122. line = triple(datplaceHolder, schemaCoords.prefix + 'label', '\"' + subj + '\"') + closeLine
  123. output.write(line)
  124. # End AS
  125. line = triple(datplaceHolder, cidocCoords.prefix + 'P128_carries', e73placeHolder) + closeLine
  126. output.write(line)
  127. line = triple(e73placeHolder, nsCoords.prefix + 'type',
  128. cidocCoords.prefix + 'E73_Information_Object') + closeLine
  129. output.write(line)
  130. # AS
  131. ss = ''
  132. if row['SGTI'] != '':
  133. ss = row['SGTI']
  134. else:
  135. ss = 'senza titolo'
  136. line = triple(e73placeHolder, schemaCoords.prefix + 'label',
  137. '\"Opera d\'arte raffigurante ' + ss + '\"') + closeLine
  138. output.write(line)
  139. # E73 - P2 - E55
  140. tt = ''
  141. typeLabel = ''
  142. if row['OGTD'] == 'dipinto':
  143. tt = aatCoords.prefix + "300033618"
  144. elif row['OGTD'] == 'rilievo':
  145. tt = aatCoords.prefix + "300047230"
  146. elif row['OGTD'] == 'polittico':
  147. tt = aatCoords.prefix + "300178235"
  148. elif row['OGTD'] == 'predella':
  149. tt = aatCoords.prefix + "300003745"
  150. else:
  151. tt = e55placeHolder
  152. line = triple(e73placeHolder,
  153. cidocCoords.prefix + 'P2_has_type',
  154. tt) + closeLine
  155. output.write(line)
  156. line = triple(tt, schemaCoords.prefix + 'label',
  157. '\"' + row['OGTD'] + '\"') + closeLine
  158. output.write(line)
  159. # E73 - P1 - E35
  160. if row['SGTT'] != '':
  161. line = triple(e73placeHolder, cidocCoords.prefix + 'P1_is_identified_by', e35placeHolder1) + closeLine
  162. output.write(line)
  163. line = triple(e35placeHolder1, nsCoords.prefix + 'type', cidocCoords.prefix + 'E35_Title') + closeLine
  164. output.write(line)
  165. line = triple(e35placeHolder1, schemaCoords.prefix + 'label', '\"' + row['SGTT'] + '\"') + closeLine
  166. output.write(line)
  167. line = triple(e35placeHolder1, cidocCoords.prefix + 'P2_has_type',
  168. aatCoords.prefix + "300417193") + closeLine
  169. output.write(line)
  170. line = triple(aatCoords.prefix + "300417193", schemaCoords.prefix + 'label',
  171. '\"titolo\"') + closeLine
  172. output.write(line)
  173. # E22 - P62 - E1
  174. if row['SGTI'] != '':
  175. line = triple(datplaceHolder,
  176. cidocCoords.prefix + 'P62_depicts',
  177. e1placeHolder) + closeLine
  178. output.write(line)
  179. line = triple(e1placeHolder,
  180. nsCoords.prefix + 'type',
  181. cidocCoords.prefix + 'E1_CRM_Entity') + closeLine
  182. output.write(line)
  183. line = triple(e1placeHolder,
  184. schemaCoords.prefix + 'label', '\"' +
  185. row['SGTI'] + '\"') + closeLine
  186. output.write(line)
  187. # doppio titolo - se usato cancellare E73 -E35
  188. '''if row['SGTI'] != 'None' and row['SGTI'] != ' ':
  189. line = triple(e73placeHolder, identifiedByCoords.prefix, "<http://www.museodipalazzopretorio.it/" + codice + "/sgti>") + closeLine
  190. output.write(line)
  191. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/sgti>",
  192. hasTypePCoords.prefix,
  193. "<http://www.museodipalazzopretorio.it/" + codice + "/ico>") + closeLine
  194. output.write(line)
  195. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/sgti>",
  196. labelCoords.prefix,
  197. '\"' + row['SGTI'] + '\"') + closeLine
  198. output.write(line)
  199. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/ico>",
  200. labelCoords.prefix,
  201. '\"' + 'Identificazione Iconografica' + '\"') + closeLine
  202. output.write(line)
  203. if row['SGTT'] != ' ':
  204. line = triple(e73placeHolder,
  205. identifiedByCoords.prefix,
  206. "<http://www.museodipalazzopretorio.it/" + codice + "/sgtt>") + closeLine
  207. output.write(line)
  208. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/sgtt>",
  209. hasTypePCoords.prefix,
  210. "<http://www.museodipalazzopretorio.it/" + codice + "/titolo>") + closeLine
  211. output.write(line)
  212. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/sgtt>",
  213. labelCoords.prefix,
  214. '\"' + row['SGTT'] + '\"') + closeLine
  215. output.write(line)
  216. line = triple("<http://www.museodipalazzopretorio.it/" + codice + "/titolo>",
  217. labelCoords.prefix,
  218. '\"' + 'Titolo' + '\"') + closeLine
  219. output.write(line)'''
  220. # Fine doppio titolo
  221. # Attention: these triples are identified only for C100005 Museo di Palazzo Pretorio
  222. if row['ESC'] == 'C100005':
  223. line = triple(datplaceHolder,
  224. cidocCoords.prefix + 'P52_has_current_owner',
  225. e74placeHolder) + closeLine
  226. output.write(line)
  227. line = triple(e74placeHolder,
  228. nsCoords.prefix + 'type',
  229. cidocCoords.prefix + 'E74_Group') + closeLine
  230. output.write(line)
  231. line = triple(e74placeHolder,
  232. schemaCoords.prefix + 'label',
  233. '\"Museo di Palazzo Pretorio\"') + closeLine
  234. output.write(line)
  235. line = triple(e74placeHolder,
  236. cidocCoords.prefix + 'P1_is_identified_by',
  237. e42CplaceHolder) + closeLine
  238. output.write(line)
  239. line = triple(e42CplaceHolder,
  240. nsCoords.prefix + 'type',
  241. cidocCoords.prefix + 'E42_Identifier') + closeLine
  242. output.write(line)
  243. line = triple(e42CplaceHolder,
  244. schemaCoords.prefix + 'label',
  245. '\"' + row['ESC'] + '\"') + closeLine
  246. output.write(line)
  247. line = triple(e42CplaceHolder,
  248. cidocCoords.prefix + 'P2_has_type',
  249. aatCoords.prefix + '300404626') + closeLine
  250. output.write(line)
  251. line = triple(aatCoords.prefix + '300404626',
  252. schemaCoords.prefix + 'label',
  253. '\"identificatore numerico\"') + closeLine
  254. output.write(line)
  255. currentLocation = ''
  256. # E22 - P54 - E53
  257. if row['LDCN'] != '':
  258. if row['LDCS'] != '':
  259. currentLocation = row['LDCS']
  260. else:
  261. currentLocation = currentLocation
  262. if row['LDCM'] != '':
  263. currentLocation = currentLocation + ', ' + row['LDCM']
  264. else:
  265. currentLocation = currentLocation
  266. if row['LDCN'] != '':
  267. currentLocation = currentLocation + ', ' + row['LDCN']
  268. else:
  269. currentLocation = currentLocation
  270. currentLocation = currentLocation + ', ' + row['PVCC'] + ' (' + row['PVCP'] + ')'
  271. line = triple(datplaceHolder, cidocCoords.prefix + 'P54_has_current_permanent_location',
  272. e53placeHolder) + closeLine
  273. output.write(line)
  274. line = triple(e53placeHolder, nsCoords.prefix + 'type', cidocCoords.prefix + 'E53_Place') + closeLine
  275. output.write(line)
  276. line = triple(e53placeHolder, schemaCoords.prefix + 'label',
  277. '\"' + currentLocation + '\"') + closeLine
  278. output.write(line)
  279. # End AS
  280. output.write('\n')
  281. #
  282. #
  283. # Limit number of entries processed (if desired)
  284. if (ii > max_entries):
  285. break