# CSV_to_RDF_ASPOPeople.py
# Converts ASPO "Onomastica" people records from a CSV export into RDF triples
# serialized in Turtle (TTL) format.
  1. # Utilities to read/write csv files
  2. import csv
  3. import re
  4. # Utilities to handle character encodings
  5. import unicodedata
  6. # Ordered Dicts
  7. from collections import OrderedDict
  8. import json
  9. # OPZIONAL IMPORTS
  10. # For timestamping/simple speed tests
  11. from datetime import datetime
  12. # Random number generator
  13. from random import *
  14. # System & command line utilities
  15. import sys
  16. # Json for the dictionary
  17. import json
# Input/output directories for the source CSV and the generated TTL file.
# NOTE(review): hard-coded local paths — adjust for your environment.
import_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
export_dir = '/Users/alessiaspadi/Documents/RESTORE/temp_ASPO/'
  20. # Custom class to store URIs + related infos for the ontologies/repositories
  21. class RDFcoords:
  22. def __init__(self, uri, prefix, code = None):
  23. self.uri = uri
  24. self.prefix = prefix
  25. self.code = code
# Repositories
# Namespace coordinates (URI + TTL prefix) for every vocabulary this script emits.
aspoCoords = RDFcoords('<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/>', 'aspo:')  # ASPO record URIs (subjects)
foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')  # FOAF people vocabulary
cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')  # CIDOC-CRM (declared but unused below — kept for the header)
schemaCoords = RDFcoords('<http://schema.org/>', 'schema:')  # schema.org properties
personCoords = RDFcoords('<http://www.w3.org/ns/person#>', 'person:')  # W3C person vocabulary (patronymicName)
nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')  # rdf:type
rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')  # rdfs:label
  34. # Basic functions for triples / shortened triples in TTL format
  35. def triple(subject, predicate, object1):
  36. line = subject + ' ' + predicate + ' ' + object1
  37. return line
  38. def doublet(predicate, object1):
  39. line = ' ' + predicate + ' ' + object1
  40. return line
  41. def singlet(object1):
  42. line = ' ' + object1
  43. return line
# Line endings in TTL format
continueLine1 = ' ;\n'  # same subject, next predicate follows
continueLine2 = ' ,\n'  # same subject and predicate, next object follows
closeLine = ' .\n'      # end of statement
  48. def writeTTLHeader(output):
  49. output.write('@prefix ' + aspoCoords.prefix + ' ' + aspoCoords.uri + closeLine)
  50. output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
  51. output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
  52. output.write('@prefix ' + personCoords.prefix + ' ' + personCoords.uri + closeLine)
  53. output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
  54. output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
  55. output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
  56. output.write('\n')
# Input CSV and output TTL share the base name 'Onomastica_Datini'.
filePrefix = 'Onomastica_'
fileType = 'Datini'
# Cap on the number of CSV rows processed; effectively "no limit" by default,
# lower it for quick test runs.
max_entries = 1000000000
  60. with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file, open(
  61. export_dir + filePrefix + fileType + '.ttl', 'w') as output:
  62. reader = csv.DictReader(csv_file)
  63. writeTTLHeader(output)
  64. first = True
  65. ii = 0
  66. for row in reader:
  67. # The index ii is used to process a limited number of entries for testing purposes
  68. ii = ii + 1
  69. if row['entityType'] == 'person':
  70. id_aspo = row['recordId']
  71. #placeHolders
  72. aspoPlaceHolder = aspoCoords.prefix + id_aspo
  73. line = triple(aspoPlaceHolder,
  74. nsCoords.prefix + 'type',
  75. foafCoords.prefix + 'person') + closeLine
  76. output.write(line)
  77. line = triple(aspoPlaceHolder,
  78. foafCoords.prefix + 'name',
  79. '\"' + row['nameEntry@normal'] + '\"') + closeLine
  80. output.write(line)
  81. if row['nome proprio'] != '':
  82. name = row['nome proprio'].replace("\n", "")
  83. line = triple(aspoPlaceHolder,
  84. foafCoords.prefix + 'givenName',
  85. '\"' + name + '\"') + closeLine
  86. output.write(line)
  87. if row['nome di famiglia'] != '':
  88. familyName = row['nome di famiglia'].replace("\n", "")
  89. line = triple(aspoPlaceHolder,
  90. foafCoords.prefix + 'familyName',
  91. '\"' + familyName + '\"') + closeLine
  92. output.write(line)
  93. if row['Alias'] != '':
  94. line = triple(aspoPlaceHolder,
  95. schemaCoords.prefix + 'alternateName',
  96. '\"' + row['Alias'] + '\"') + closeLine
  97. output.write(line)
  98. if row['genere'] != '':
  99. line = triple(aspoPlaceHolder,
  100. foafCoords.prefix + 'gender',
  101. '\"' + row['genere'] + '\"') + closeLine
  102. output.write(line)
  103. if row['patronimico/matronimico'] != '':
  104. pp = row['patronimico/matronimico']
  105. patronimyc = pp.replace("\n", "")
  106. line = triple(aspoPlaceHolder,
  107. personCoords.prefix + 'patronymicName',
  108. '\"' + patronimyc + '\"') + closeLine
  109. output.write(line)
  110. if row['occupation'] != '' and row['occupation'] != '\n':
  111. occ = row['occupation']
  112. occupation = re.sub(r'[^A-Za-z]', '', occ)
  113. line = triple(aspoPlaceHolder,
  114. schemaCoords.prefix + 'hasOccupation',
  115. aspoCoords.prefix + occupation) + closeLine
  116. output.write(line)
  117. line = triple(aspoCoords.prefix + occupation,
  118. nsCoords.prefix + 'type',
  119. schemaCoords.prefix + 'Occupation') + closeLine
  120. output.write(line)
  121. line = triple(aspoCoords.prefix + occupation,
  122. rdfsCoords.prefix + 'label',
  123. '\"' + row['occupation'] + '\"') + closeLine
  124. output.write(line)
  125. if row['avo 1'] != '':
  126. avo1 = row['avo 1'].replace('di ', '')
  127. avo1card = re.sub(r'[^A-Za-z]', '', avo1)
  128. line = triple(aspoPlaceHolder,
  129. schemaCoords.prefix + 'relatedTo',
  130. aspoCoords.prefix + avo1card) + closeLine
  131. output.write(line)
  132. line = triple(aspoCoords.prefix + avo1card,
  133. nsCoords.prefix + 'type',
  134. foafCoords.prefix + 'Person') + closeLine
  135. output.write(line)
  136. line = triple(aspoCoords.prefix + avo1card,
  137. rdfsCoords.prefix + 'label',
  138. '\"' + avo1 + '\"') + closeLine
  139. output.write(line)
  140. if row['avo 2'] != '':
  141. avo2 = row['avo 2'].replace('di ', '')
  142. avo2card = re.sub(r'[^A-Za-z]', '', avo2)
  143. line = triple(aspoPlaceHolder,
  144. schemaCoords.prefix + 'relatedTo',
  145. aspoCoords.prefix + avo2card) + closeLine
  146. output.write(line)
  147. line = triple(aspoCoords.prefix + avo2card,
  148. nsCoords.prefix + 'type',
  149. foafCoords.prefix + 'Person') + closeLine
  150. output.write(line)
  151. line = triple(aspoCoords.prefix + avo2card,
  152. rdfsCoords.prefix + 'label',
  153. '\"' + avo2 + '\"') + closeLine
  154. output.write(line)
  155. if row['Qualifica'] != '':
  156. qq = row['Qualifica']
  157. qualifiche = []
  158. if '|' in qq:
  159. qualifiche = qq.split('|')
  160. else:
  161. qualifiche.append(qq)
  162. for qualifica in qualifiche:
  163. honorific = qualifica.replace("\n", "")
  164. line = triple(aspoPlaceHolder,
  165. schemaCoords.prefix + 'honorificPrefix',
  166. '\"' + honorific + '\"') + closeLine
  167. output.write(line)
  168. if row['place_occupation_Qualifica'] != '':
  169. line = triple(aspoPlaceHolder,
  170. schemaCoords.prefix + 'workLocation',
  171. '\"' + row['place_occupation_Qualifica'] + '\"') + closeLine
  172. output.write(line)
  173. if row['biogHist p'] != '':
  174. bio = row['biogHist p']
  175. biog = bio.replace("\n", " ")
  176. line = triple(aspoPlaceHolder,
  177. schemaCoords.prefix + 'description',
  178. '\"' + biog + '\"') + closeLine
  179. output.write(line)
  180. output.write('\n')
  181. #
  182. #
  183. # Limit number of entries processed (if desired)
  184. if (ii > max_entries):
  185. break