# CSV_to_RDF_Autori.py

## IMPORTS
# Utilities to read/write csv files
import csv

# Directories
import_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/CSV/corretti/'
export_dir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/MPP/RDF/'

# Custom class to store URIs + related infos for the ontologies/repositories
class RDFcoords:
    def __init__(self, uri, prefix, code=None):
        self.uri = uri
        self.prefix = prefix
        self.code = code

# Repositories
museoCoords = RDFcoords('<https://palazzopretorio.prato.it/it/le-opere/alcuni-capolavori/>', 'mpp:')
autCoords = RDFcoords('<https://palazzopretorio.prato.it/it/opere/autori/>', 'aut:')
foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
schemaCoords = RDFcoords('<http://www.schema.org/>', 'schema:')
rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
# Basic utilities to format triples / shortened triples in TTL format
#
# Format full triple
def triple(subject, predicate, object1):
    line = subject + ' ' + predicate + ' ' + object1
    return line

# Format entry in predicate list (no subject)
def doublet(predicate, object1):
    line = ' ' + predicate + ' ' + object1
    return line

# Format entry in object list (object only)
def singlet(object1):
    line = ' ' + object1
    return line

# Line endings
continueLine1 = ' ;\n'  # Before a predicate list, that is if the FOLLOWING triple has the same subject
continueLine2 = ' ,\n'  # Before an object list, that is if the FOLLOWING triple has the same subject and predicate
closeLine = ' .\n'  # To end a triple / a triples block
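
# Illustrative example (not part of the original script) of how the helpers above
# compose into a Turtle block, assuming the 'aut:', 'rdf:', 'crm:' and 'rdfs:'
# prefixes declared in the header below:
#
#   triple('aut:123', 'rdf:type', 'crm:E21_Person') + continueLine1
#   doublet('rdfs:label', '"Example label"') + closeLine
#
# concatenates to:
#
#   aut:123 rdf:type crm:E21_Person ;
#    rdfs:label "Example label" .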
def writeTTLHeader(output):
    output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
    output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
    output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
    output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
    output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
    output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
    output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
    output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
    output.write('\n')

filePrefix = 'AR20AUT_'
fileType = 'Datini'
max_entries = None
# Parse a CSV file passed in as raw bytes: decode it, read the rows into a list
# of dicts and hand the result to parse()
def parsefromfile(infile, outfile):
    pyppa = infile.decode()
    csv_dicts = [{k: v for k, v in row.items()} for row in csv.DictReader(pyppa.splitlines(), skipinitialspace=True)]
    parse(csv_dicts, outfile)
def parse(dict_list, outfile):
    with open(outfile, 'w') as output:
        writeTTLHeader(output)
        first = True  # In case something needs processing only once for the whole CSV input
        for ii, row in enumerate(dict_list):
            # The index ii is mainly used to limit the number of entries to process, for testing purposes
            url = row['URL']
            # Placeholders
            e21placeHolder = autCoords.prefix + url
            e62placeHolder = autCoords.prefix + url + '_E62'
            e42placeHolder = autCoords.prefix + row['AUTH']
            e67placeHolder = autCoords.prefix + url + '_E67'
            e69placeHolder = autCoords.prefix + url + '_E69'

            line = triple(e21placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E21_Person') + closeLine
            output.write(line)
            line = triple(e21placeHolder,
                          nsCoords.prefix + 'type',
                          foafCoords.prefix + 'Person') + closeLine
            output.write(line)
            if row['AUTN'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'name',
                              '\"' + row['AUTN'] + '\"') + closeLine
                output.write(line)
            if row['AUTC'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'familyName',
                              '\"' + row['AUTC'] + '\"') + closeLine
                output.write(line)
            if row['AUTO'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'givenName',
                              '\"' + row['AUTO'] + '\"') + closeLine
                output.write(line)
            if row['AUTZ'] != '':
                line = triple(e21placeHolder,
                              foafCoords.prefix + 'gender',
                              '\"' + row['AUTZ'] + '\"') + closeLine
                output.write(line)
            line = triple(e21placeHolder, rdfsCoords.prefix + 'label',
                          '\"' + row['AUTN'] + ', ' + row['AUTA'] + '\"') + closeLine
            output.write(line)
            line = triple(e21placeHolder, cidocCoords.prefix + 'P3_has_note',
                          e62placeHolder) + closeLine
            output.write(line)
            line = triple(e62placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E62_String') + closeLine
            output.write(line)
            line = triple(e62placeHolder, rdfsCoords.prefix + 'label',
                          '\"Fonte: Museo di Palazzo Pretorio - Collezione Martini\"') + closeLine
            output.write(line)
            # E21 - P1 - E42
            line = triple(e21placeHolder, cidocCoords.prefix + 'P1_is_identified_by',
                          e42placeHolder) + closeLine
            output.write(line)
            line = triple(e42placeHolder, nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E42_Identifier') + closeLine
            output.write(line)
            line = triple(e42placeHolder, rdfsCoords.prefix + 'label',
                          '\"' + row['AUTH'] + '\"') + closeLine
            output.write(line)
            # E21 - P107i - E74
            if row['AUTU'] != '':
                group = []
                if '/' in row['AUTU']:
                    group = row['AUTU'].split('/')
                else:
                    group.append(row['AUTU'])
                for gr in group:
                    gg = gr.replace(' ', '')
                    e74placeHolder = museoCoords.prefix + gg
                    line = triple(e21placeHolder,
                                  cidocCoords.prefix + 'P107i_is_current_or_former_member_of',
                                  e74placeHolder) + closeLine
                    output.write(line)
                    line = triple(e74placeHolder,
                                  nsCoords.prefix + 'type',
                                  cidocCoords.prefix + 'E74_Group') + closeLine
                    output.write(line)
                    line = triple(e74placeHolder,
                                  rdfsCoords.prefix + 'label',
                                  '\"' + row['AUTU'] + '\"') + closeLine
                    output.write(line)
            # E21 - P98i - E67
            line = triple(e21placeHolder,
                          cidocCoords.prefix + 'P98i_was_born',
                          e67placeHolder) + closeLine
            output.write(line)
            line = triple(e67placeHolder,
                          nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E67_Birth') + closeLine
            output.write(line)
            line = triple(e67placeHolder,
                          rdfsCoords.prefix + 'label',
                          '\"Nascita di ' + row['AUTN'] + '\"') + closeLine
            output.write(line)
            # E21 - P100i - E69
            line = triple(e21placeHolder,
                          cidocCoords.prefix + 'P100i_died_in',
                          e69placeHolder) + closeLine
            output.write(line)
            line = triple(e69placeHolder,
                          nsCoords.prefix + 'type',
                          cidocCoords.prefix + 'E69_Death') + closeLine
            output.write(line)
            line = triple(e69placeHolder,
                          rdfsCoords.prefix + 'label',
                          '\"Morte di ' + row['AUTN'] + '\"') + closeLine
            output.write(line)
            # E67 - P7 - E53
            if row['AUTL'] != '':
                line = triple(e67placeHolder,
                              cidocCoords.prefix + 'P7_took_place_at',
                              museoCoords.prefix + row['AUTL']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTL'],
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E53_Place') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTL'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTL'] + '\"') + closeLine
                output.write(line)
            # E67 - P4 - E52
            if row['AUTD'] != '':
                tt = row['AUTD'].replace(' ', '')
                tim = tt.replace('/', '')
                time = tim.replace('.', '')
                line = triple(e67placeHolder,
                              cidocCoords.prefix + 'P4_has_time-span',
                              museoCoords.prefix + time) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E52_Time-Span') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTD'] + '\"') + closeLine
                output.write(line)
            # E69 - P7 - E53
            if row['AUTX'] != '':
                line = triple(e69placeHolder,
                              cidocCoords.prefix + 'P7_took_place_at',
                              museoCoords.prefix + row['AUTX']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTX'],
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E53_Place') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTX'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTX'] + '\"') + closeLine
                output.write(line)
            # E69 - P4 - E52
            if row['AUTT'] != '':
                tt = row['AUTT'].replace(' ', '')
                tim = tt.replace('/', '')
                time = tim.replace('.', '')
                line = triple(e69placeHolder,
                              cidocCoords.prefix + 'P4_has_time-span',
                              museoCoords.prefix + time) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              nsCoords.prefix + 'type',
                              cidocCoords.prefix + 'E52_Time-Span') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + time,
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTT'] + '\"') + closeLine
                output.write(line)
            # E21 - occupation
            if row['AUTQ'] != '':
                line = triple(e21placeHolder,
                              schemaCoords.prefix + 'hasOccupation',
                              museoCoords.prefix + row['AUTQ']) + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTQ'],
                              nsCoords.prefix + 'type',
                              schemaCoords.prefix + 'Occupation') + closeLine
                output.write(line)
                line = triple(museoCoords.prefix + row['AUTQ'],
                              rdfsCoords.prefix + 'label',
                              '\"' + row['AUTQ'] + '\"') + closeLine
                output.write(line)
            # E21 - P139 - E41
            if row['AUTV'] != '':
                autv = []
                if '/' in row['AUTV']:
                    autv = row['AUTV'].split('/')
                else:
                    autv.append(row['AUTV'])
                autvplaceHolder = museoCoords.prefix + row['AUTV'].replace(' ', '-').replace('\'', '')
                line = triple(e21placeHolder,
                              cidocCoords.prefix + 'P139_has_alternative_form',
                              '\"' + row['AUTV'] + '\"') + closeLine
                output.write(line)

            output.write('\n')
            #
            #
            # To limit the number of entries processed (if desired for testing purposes)
            if max_entries is not None and ii > max_entries:
                break
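
# Minimal usage sketch (not part of the original script): read one CSV export
# and write the corresponding Turtle file. The input file name below is an
# illustrative assumption; the directories and the filePrefix/fileType
# constants come from the script above.
if __name__ == '__main__':
    csv_name = 'AR20AUT_Datini.csv'   # hypothetical input file name
    ttl_name = filePrefix + fileType + '.ttl'
    with open(import_dir + csv_name, newline='') as f:
        rows = [dict(row) for row in csv.DictReader(f, skipinitialspace=True)]
    parse(rows, export_dir + ttl_name)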