# CSV_to_RDF_generico.py
## IMPORTS
# Utilities to read/write csv files
import csv
import io
import json
from operator import truediv
  5. # Custom class to store URIs + related infos for the ontologies/repositories
  6. class RDFcoords:
  7. def __init__(self, uri, prefix, code = None):
  8. self.uri = uri
  9. self.prefix = prefix
  10. self.code = code
  11. # Repositories
  12. museoCoords = RDFcoords('<https://palazzopretorio.prato.it/it/le-opere/alcuni-capolavori/>', 'mpp:')
  13. autCoords = RDFcoords('<https://palazzopretorio.prato.it/it/opere/autori/>', 'aut:')
  14. foafCoords = RDFcoords('<http://xmlns.com/foaf/0.1/>', 'foaf:')
  15. cidocCoords = RDFcoords('<http://www.cidoc-crm.org/cidoc-crm/>', 'crm:')
  16. aatCoords = RDFcoords('<http://vocab.getty.edu/aat/>', 'aat:')
  17. nsCoords = RDFcoords('<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', 'rdf:')
  18. schemaCoords = RDFcoords('<http://www.schema.org/>', 'schema:')
  19. rdfsCoords = RDFcoords('<http://www.w3.org/2000/01/rdf-schema#>', 'rdfs:')
  20. # Basic utilities to format triples / shortened triples in TTL format
  21. #
  22. # Format full triple
  23. def triple(subject, predicate, object1):
  24. line = subject + ' ' + predicate + ' ' + object1
  25. return line
  26. # Format entry in predicate list (no subject)
  27. def doublet(predicate, object1):
  28. line = ' ' + predicate + ' ' + object1
  29. return line
  30. # Format entry in object list (object only)
  31. def singlet(object1):
  32. line = ' ' + object1
  33. return line
  34. # Line endings
  35. continueLine1 = ' ;\n' # Before a predicate list, that is if the FOLLOWING triple has the same subject
  36. continueLine2 = ' ,\n' # Before an object list, that is if the FOLLOWING triple has the same subject and predicate
  37. closeLine = ' .\n' # To end a triple / a triples block
  38. def writeTTLHeader(output):
  39. output.write('@prefix ' + museoCoords.prefix + ' ' + museoCoords.uri + closeLine)
  40. output.write('@prefix ' + foafCoords.prefix + ' ' + foafCoords.uri + closeLine)
  41. output.write('@prefix ' + autCoords.prefix + ' ' + autCoords.uri + closeLine)
  42. output.write('@prefix ' + cidocCoords.prefix + ' ' + cidocCoords.uri + closeLine)
  43. output.write('@prefix ' + aatCoords.prefix + ' ' + aatCoords.uri + closeLine)
  44. output.write('@prefix ' + schemaCoords.prefix + ' ' + schemaCoords.uri + closeLine)
  45. output.write('@prefix ' + nsCoords.prefix + ' ' + nsCoords.uri + closeLine)
  46. output.write('@prefix ' + rdfsCoords.prefix + ' ' + rdfsCoords.uri + closeLine)
  47. output.write('\n')
  48. max_entries = None
  49. def parsefromfile(mapfilename, infile, outfilename):
  50. inputFile = infile.decode()
  51. csv_dicts = [{k: v for k, v in row.items()} for row in csv.DictReader(inputFile.splitlines(), skipinitialspace=True)]
  52. with open (mapfilename) as mapfile:
  53. json_dicts = json.load(mapfile)
  54. parse(json_dicts, csv_dicts, outfilename)
  55. def parse(json_dicts, csv_dicts, outfilename):
  56. with open(outfilename, 'w') as outputfile:
  57. writeTTLHeader(outputfile)
  58. first = True # In case something needs processing only once for the whole CSV input
  59. for ii, csvrow in enumerate(csv_dicts):
  60. # Skip the first line as it carries info we don't want to triplify
  61. if(first):
  62. first = False
  63. continue
  64. # The index ii is mainly used to limit the number of entries to process, for testing purposes
  65. for jj, node in enumerate(json_dicts):
  66. if type(node["colonna"]) is list:
  67. csvvalue = [csvrow[col] for col in node["colonna"]]
  68. else:
  69. csvvalue = csvrow[node["colonna"]]
  70. if checkEmptyValue(csvvalue):
  71. continue
  72. line = triple(settripleuri(csvvalue, node["uri"]), nsCoords.prefix + 'type', node["tipo"]) + closeLine
  73. outputfile.write(line)
  74. if node["sottoelementodi"] != '':
  75. parent = next (filter(lambda el: el["identificatore"]==node["sottoelementodi"], json_dicts), None)
  76. if parent is not None:
  77. if type(parent["colonna"]) is list:
  78. parent_csvvalue = [csvrow[col] for col in parent["colonna"]]
  79. else:
  80. parent_csvvalue = csvrow[parent["colonna"]]
  81. subject = settripleuri(parent_csvvalue, parent["uri"])
  82. property = node["relazione"]
  83. object = settripleuri(csvvalue, node["uri"])
  84. line = triple(subject, property,
  85. object) + closeLine
  86. outputfile.write(line)
  87. outputfile.write('\n')
  88. #
  89. #
  90. # To limit number of entries processed (if desired for testing purposes)
  91. if (max_entries is not None and ii > max_entries):
  92. break
  93. def settripleuri (csvvalue, nodeuri):
  94. output = "\""+nodeuri+"\""
  95. if type(csvvalue) is list:
  96. for ii, value in enumerate(csvvalue):
  97. if value=='':
  98. output = output.replace('$VALORE_CSV_'+ str(ii)+'$', 'N/A')
  99. else:
  100. output = output.replace('$VALORE_CSV_'+ str(ii)+'$', value)
  101. else:
  102. output = output.replace('$VALORE_CSV$', csvvalue)
  103. return output
  104. def checkEmptyValue(csvvalue):
  105. if type(csvvalue) is list:
  106. emptyList = ['' for el in csvvalue]
  107. if emptyList==csvvalue:
  108. return True
  109. if csvvalue=='':
  110. return True
  111. return False