CSV_to_RDF_generico.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
## IMPORTS
# Utilities to read/write csv files
import csv
import io
import json
from operator import truediv
  5. # Custom class to store URIs + related infos for the ontologies/repositories
  6. class RDFcoords:
  7. def __init__(self, uri, prefix, code = None):
  8. self.uri = uri
  9. self.prefix = prefix
  10. self.code = code
  11. # Repositories
  12. museoCoords = RDFcoords('<https://palazzopretorio.prato.it/it/le-opere/alcuni-capolavori/>', 'mpp:')
  13. autCoords = RDFcoords('<https://palazzopretorio.prato.it/it/opere/autori/>', 'aut:')
  14. foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
  15. cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
  16. aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
  17. nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
  18. schemaCoords = RDFcoords('<http://www.schema.org/>', 'schema:')
  19. rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
  20. # Basic utilities to format triples / shortened triples in TTL format
  21. #
  22. # Format full triple
  23. def triple(subject, predicate, object1):
  24. line = subject + ' ' + predicate + ' ' + object1
  25. return line
  26. # Format entry in predicate list (no subject)
  27. def doublet(predicate, object1):
  28. line = ' ' + predicate + ' ' + object1
  29. return line
  30. # Format entry in object list (object only)
  31. def singlet(object1):
  32. line = ' ' + object1
  33. return line
  34. # Line endings
  35. continueLine1 = ' ;\n' # Before a predicate list, that is if the FOLLOWING triple has the same subject
  36. continueLine2 = ' ,\n' # Before an object list, that is if the FOLLOWING triple has the same subject and predicate
  37. closeLine = ' .\n' # To end a triple / a triples block
  38. def writeTTLHeader(output):
  39. output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
  40. output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
  41. output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
  42. output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
  43. output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
  44. output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
  45. output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
  46. output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
  47. output.write('\n')
  48. max_entries = None
  49. def parsefromfile(mapfilename, formFields, infile, outfilename):
  50. inputFile = infile.decode()
  51. csv_dicts = [{k: v for k, v in row.items()} for row in csv.DictReader(inputFile.splitlines(), skipinitialspace=True)]
  52. parse(mapfilename, formFields, csv_dicts, outfilename)
  53. def parse(mapfilename, formFields, csv_dicts, outfilename):
  54. with open (mapfilename) as mapfile:
  55. triple_blocks = json.load(mapfile)
  56. allRefs = getRefs(triple_blocks)
  57. doReplace = True
  58. while doReplace:
  59. doReplace = False
  60. for ref in allRefs['subjects_with_refs']:
  61. oldVal = ref['value']
  62. newVal = replaceRefs(allRefs, oldVal)
  63. if(oldVal != newVal):
  64. ref['value'] = newVal
  65. doReplace = True
  66. for ref in allRefs['objects_with_refs']:
  67. oldVal = ref['value']
  68. newVal = replaceRefs(allRefs, oldVal)
  69. if(oldVal != newVal):
  70. ref['value'] = newVal
  71. doReplace = True
  72. with open(outfilename, 'w') as outputfile:
  73. writeTTLHeader(outputfile)
  74. for csvrow in csv_dicts:
  75. for entry in triple_blocks:
  76. subject = entry['subject']['value'] if type(entry['subject']) is dict else entry['subject']
  77. subject = replaceRefs(allRefs, subject)
  78. subject = replace_csv_values(formFields, csvrow, subject)
  79. for content in entry['content']:
  80. attribute = content['predicate']
  81. object1 = content['object']['value'] if type(content['object']) is dict else content['object']
  82. object1 = replaceRefs(allRefs, object1)
  83. object1 = replace_csv_values(formFields, csvrow, object1)
  84. toWrite = triple(subject, attribute, object1)
  85. outputfile.write(toWrite)
  86. outputfile.write(closeLine)
  87. outputfile.write('\n')
  88. def getRefs(triple_blocks: dict):
  89. subjects_with_refs = []
  90. for block in triple_blocks:
  91. try:
  92. subject_ref = block['subject']['ref']
  93. except:
  94. subject_ref = None
  95. if subject_ref is not None:
  96. subjects_with_refs.append(block['subject'])
  97. objects_with_refs = []
  98. for block in triple_blocks:
  99. for content in block['content']:
  100. try:
  101. object_ref = content['object']['ref']
  102. except:
  103. object_ref = None
  104. if object_ref is not None:
  105. objects_with_refs.append(content['object'])
  106. return {'subjects_with_refs': subjects_with_refs, 'objects_with_refs': objects_with_refs}
  107. def replace_csv_values(formFields: list, csvrow: dict, val: str):
  108. outStr = val
  109. for field in formFields:
  110. outStr = outStr.replace('#csv:'+field+'#', csvrow[field])
  111. return outStr
  112. def replaceRefs(allRefs, val):
  113. outStr = val
  114. for ref in allRefs['subjects_with_refs']:
  115. outStr = outStr.replace('#ref:'+ref['ref']+'#', ref['value'])
  116. for ref in allRefs['objects_with_refs']:
  117. outStr = outStr.replace('#obj_ref:'+ref['ref']+'#', ref['value'])
  118. return outStr