# CSV_to_RDF_generico.py
# Generic CSV-to-RDF converter: writes Turtle (TTL) triples from CSV rows using a JSON mapping file.
## IMPORTS
# Utilities to read/write csv files
import csv
import io
import json
from operator import truediv
  5. # Custom class to store URIs + related infos for the ontologies/repositories
  6. class RDFcoords:
  7. def __init__(self, uri, prefix, code = None):
  8. self.uri = uri
  9. self.prefix = prefix
  10. self.code = code
  11. # Repositories
  12. museoCoords = RDFcoords('<https://palazzopretorio.prato.it/it/le-opere/alcuni-capolavori/>', 'mpp:')
  13. aspoCoords = RDFcoords('<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/>', 'aspo:')
  14. autCoords = RDFcoords('<https://palazzopretorio.prato.it/it/opere/autori/>', 'aut:')
  15. foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
  16. cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
  17. aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
  18. nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
  19. schemaCoords = RDFcoords('<http://www.schema.org/>', 'schema:')
  20. rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
  21. owlCoords = RDFcoords('<http://www.w3.org/2002/07/owl#>', 'owl:')
  22. # Basic utilities to format triples / shortened triples in TTL format
  23. #
  24. # Format full triple
  25. def triple(subject, predicate, object1):
  26. line = subject + ' ' + predicate + ' ' + object1
  27. return line
  28. # Format entry in predicate list (no subject)
  29. def doublet(predicate, object1):
  30. line = ' ' + predicate + ' ' + object1
  31. return line
  32. # Format entry in object list (object only)
  33. def singlet(object1):
  34. line = ' ' + object1
  35. return line
  36. # Line endings
  37. continueLine1 = ' ;\n' # Before a predicate list, that is if the FOLLOWING triple has the same subject
  38. continueLine2 = ' ,\n' # Before an object list, that is if the FOLLOWING triple has the same subject and predicate
  39. closeLine = ' .\n' # To end a triple / a triples block
  40. def writeTTLHeader(output):
  41. output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
  42. output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
  43. output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
  44. output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
  45. output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
  46. output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
  47. output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
  48. output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
  49. output.write('@prefix ' + owlCoords.prefix + ' ' + owlCoords.uri + closeLine)
  50. output.write('@prefix ' + aspoCoords.prefix + ' ' + aspoCoords.uri + closeLine)
  51. output.write('\n')
  52. max_entries = None
  53. def parsefromfile(mapfilename, formFields, infile, outfilename):
  54. inputFile = infile.decode()
  55. csv_dicts = [{k: v for k, v in row.items()} for row in csv.DictReader(inputFile.splitlines(), skipinitialspace=True)]
  56. parse(mapfilename, formFields, csv_dicts, outfilename)
  57. def parse(mapfilename, formFields, csv_dicts, outfilename):
  58. with open (mapfilename) as mapfile:
  59. triple_blocks = json.load(mapfile)
  60. allRefs = getRefs(triple_blocks)
  61. doReplace = True
  62. while doReplace:
  63. doReplace = False
  64. for ref in allRefs['subjects_with_refs']:
  65. oldVal = ref['value']
  66. newVal = replaceRefs(allRefs, oldVal)
  67. if(oldVal != newVal):
  68. ref['value'] = newVal
  69. doReplace = True
  70. for ref in allRefs['objects_with_refs']:
  71. oldVal = ref['value']
  72. newVal = replaceRefs(allRefs, oldVal)
  73. if(oldVal != newVal):
  74. ref['value'] = newVal
  75. doReplace = True
  76. with open(outfilename, 'w') as outputfile:
  77. writeTTLHeader(outputfile)
  78. for csvrow in csv_dicts:
  79. for entry in triple_blocks:
  80. subject = entry['subject']['value'] if type(entry['subject']) is dict else entry['subject']
  81. subject = replaceRefs(allRefs, subject)
  82. subject = replace_csv_values(formFields, csvrow, subject)
  83. for content in entry['content']:
  84. attribute = content['predicate']
  85. object1 = content['object']['value'] if type(content['object']) is dict else content['object']
  86. object1 = replaceRefs(allRefs, object1)
  87. object1 = replace_csv_values(formFields, csvrow, object1)
  88. toWrite = triple(subject, attribute, object1)
  89. outputfile.write(toWrite)
  90. outputfile.write(closeLine)
  91. outputfile.write('\n')
  92. def getRefs(triple_blocks: dict):
  93. subjects_with_refs = []
  94. for block in triple_blocks:
  95. try:
  96. subject_ref = block['subject']['ref']
  97. except:
  98. subject_ref = None
  99. if subject_ref is not None:
  100. subjects_with_refs.append(block['subject'])
  101. objects_with_refs = []
  102. for block in triple_blocks:
  103. for content in block['content']:
  104. try:
  105. object_ref = content['object']['ref']
  106. except:
  107. object_ref = None
  108. if object_ref is not None:
  109. objects_with_refs.append(content['object'])
  110. return {'subjects_with_refs': subjects_with_refs, 'objects_with_refs': objects_with_refs}
  111. def replace_csv_values(formFields: list, csvrow: dict, val: str):
  112. outStr = val
  113. for field in formFields:
  114. outStr = outStr.replace('#csv:'+field+'#', csvrow[field])
  115. return outStr
  116. def replaceRefs(allRefs, val):
  117. outStr = val
  118. for ref in allRefs['subjects_with_refs']:
  119. outStr = outStr.replace('#ref:'+ref['ref']+'#', ref['value'])
  120. for ref in allRefs['objects_with_refs']:
  121. outStr = outStr.replace('#obj_ref:'+ref['ref']+'#', ref['value'])
  122. return outStr