# ontology_parser_CIDOC.py
  # %%
  import csv
  import json
  import unicodedata
  # BASIC CONFIGURATION
  # Folder that holds the input workbook, the intermediate CSV files and the RDF template.
  DATA_FOLDER = './data/'
  # Folder that receives the generated .json and .rdf files.
  OUTPUT_FOLDER = './output/'
  # Base name shared by the workbook and by every file derived from it.
  ONTO_FILENAME = 'man_draft_CIDOC'  # No extension!
  # Names of the intermediate CSV files (one per worksheet); both are created in DATA_FOLDER.
  ent_filename = ONTO_FILENAME + '_entities.csv'
  rel_filename = ONTO_FILENAME + '_relations.csv'
  # %%
  # PART I: parse xlsx to (multiple) csv
  # CONFIGURATION
  XLSX_FILENAME = ONTO_FILENAME + '.xlsx'
  # Worksheet names looked up in the workbook below; they must match the file.
  ENTITIES_SHEETNAME = 'Entità'
  RELATIONS_SHEETNAME = 'Relazioni'
  # %%
  # Import xlsx through openpyxl
  import openpyxl as op
  # NOTE(review): the original comment asked whether an encoding should be
  # given here. As far as I can tell load_workbook has no encoding parameter
  # (an .xlsx file is a zipped XML container) -- confirm against the openpyxl docs.
  input_data = op.load_workbook(DATA_FOLDER + XLSX_FILENAME)
  entities_sheet = input_data[ENTITIES_SHEETNAME]
  relations_sheet = input_data[RELATIONS_SHEETNAME]
  # %%
  # Export sheet data to csv
  # newline='' is required by the csv module for files it writes to: without
  # it, platforms that translate '\n' on output end every row with '\r\r\n',
  # which reads back as spurious empty rows.
  for sheet, csv_name in ((entities_sheet, ent_filename), (relations_sheet, rel_filename)):
      with open(DATA_FOLDER + csv_name, 'w', encoding='utf-8', newline='') as out_file:
          writer = csv.writer(out_file)
          writer.writerows(sheet.values)
  # %%
  # PART II: collect csv data into a 'pre-ontology' structure
  # Read csv files back in (or use them directly as starting points)
  # True: the first row of each CSV holds the column names.
  # False: the label constants below are used as field names, in column order.
  HEADER_ROW = True
  # Not difficult to add more keys (column names)
  # Columns of the entities table.
  ENTITIES_COLUMN_LABEL = 'ENTITÀ'
  ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
  SAMEAS_COLUMN_LABEL = 'SAME AS'
  # Columns of the relations table.
  RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
  RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
  RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
  INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
  # Column used by both tables.
  CIDOC_COLUMN_LABEL = 'CIDOC-LINK'
  # fieldnames=None makes DictReader take the names from the first row,
  # which is exactly what the HEADER_ROW=True case needs.
  entity_fieldnames = None if HEADER_ROW else [
      ENTITIES_COLUMN_LABEL,
      ATTRIBUTES_COLUMN_LABEL,
      SAMEAS_COLUMN_LABEL,
      CIDOC_COLUMN_LABEL,
  ]
  relation_fieldnames = None if HEADER_ROW else [
      RELATION_FIRST_COLUMN_LABEL,
      RELATION_SECOND_COLUMN_LABEL,
      RELATION_NAME_COLUMN_LABEL,
      INVERSE_RELATION_COLUMN_LABEL,
      CIDOC_COLUMN_LABEL,
  ]
  # newline='' hands raw line endings to the csv module, so that quoted cells
  # containing line breaks are parsed as one field instead of being split.
  with open(DATA_FOLDER + ent_filename, 'r', encoding='utf-8', newline='') as in_file:
      reader = csv.DictReader(in_file, fieldnames=entity_fieldnames)
      # Materialise the rows while the file is still open.
      entities = list(reader)
  with open(DATA_FOLDER + rel_filename, 'r', encoding='utf-8', newline='') as in_file:
      reader = csv.DictReader(in_file, fieldnames=relation_fieldnames)
      relations = list(reader)
  # %%
  # From here on, work with the 'entities' and 'relations' lists of dicts. Arrange them in a nested structure, for convenience
  def dict_lists_to_json(entities_local, relations_local):
      """Arrange the flat entity and relation rows into one nested structure.

      Entity rows are read top to bottom: a row with a value in the entity
      column opens (or re-opens) that entity, and following rows with an
      empty entity column add attributes / synonyms to the entity opened
      last.

      Args:
          entities_local: Iterable of dict rows keyed by the entity column
              labels (ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL,
              SAMEAS_COLUMN_LABEL, CIDOC_COLUMN_LABEL).
          relations_local: Iterable of dict rows keyed by the relation column
              labels (RELATION_*_COLUMN_LABEL, INVERSE_RELATION_COLUMN_LABEL,
              CIDOC_COLUMN_LABEL).

      Returns:
          A dict with two keys: 'Entità' maps each entity name to its record
          (optional keys 'Classe CIDOC proposta', 'Attributi', 'Sinonimi',
          'Sottoclasse di'); 'Relazioni' is the list of all relations other
          than "is_subclass_of".
      """
      entity = {}
      current_entity = None
      for row in entities_local:
          entity_name = row.get(ENTITIES_COLUMN_LABEL)
          attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
          cidoc_link = row.get(CIDOC_COLUMN_LABEL)
          same_as_row = row.get(SAMEAS_COLUMN_LABEL) or ''
          # Drop empty fragments such as the middle one in "a, ,b".
          synonyms = [s.strip() for s in same_as_row.split(',') if s.strip()]
          if entity_name:
              current_entity = entity_name
              # setdefault: a sheet that repeats the entity name on every
              # attribute row must not wipe what was already collected.
              entity.setdefault(current_entity, {})
              if cidoc_link:
                  entity[current_entity]['Classe CIDOC proposta'] = cidoc_link
          if not current_entity:
              # Rows that precede the first named entity have no owner.
              continue
          record = entity[current_entity]
          if attribute_name:
              attribute = {'Nome': attribute_name}
              if cidoc_link:
                  attribute['Classe CIDOC proposta'] = cidoc_link
              record.setdefault('Attributi', []).append(attribute)
          if synonyms:
              record['Sinonimi'] = synonyms
      # Single pass over the relations: subclass rows annotate the entity,
      # every other row becomes a relation record.
      entity_relations = []
      for row in relations_local:
          entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
          entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
          label = row.get(RELATION_NAME_COLUMN_LABEL)
          if not (entity1 or entity2 or label):
              # Completely blank spreadsheet row (e.g. trailing rows of the
              # sheet): it carries no relation.
              continue
          if label == "is_subclass_of":
              if entity1 in entity:
                  entity[entity1]["Sottoclasse di"] = entity2
              continue
          entity_relations.append({
              "Entità 1": entity1,
              "Entità 2": entity2,
              "Etichetta": label,
              "Inversa": row.get(INVERSE_RELATION_COLUMN_LABEL),
              "Proprietà CIDOC proposta": row.get(CIDOC_COLUMN_LABEL),
          })
      # Create final JSON structure
      return {
          "Entità": entity,
          "Relazioni": entity_relations,
      }
  # %%
  json_data = dict_lists_to_json(entities, relations)
  # Export data
  # ensure_ascii=False writes accented characters as-is, so the file encoding
  # is fixed to UTF-8 instead of being left to the platform default.
  with open(OUTPUT_FOLDER + ONTO_FILENAME + '.json', 'w', encoding='utf-8') as out_json:
      json.dump(json_data, out_json, indent=2, ensure_ascii=False)
  # %%
  # Consistency check on the in-memory structure: every entity named by a
  # relation should also be defined as an entity.
  entity_set = set(json_data['Entità'].keys())
  entity_relations_set = {ent for rel in json_data['Relazioni'] for ent in [rel['Entità 1'], rel['Entità 2']]}
  # The check: print the names used in relations but never defined.
  if not entity_relations_set.issubset(entity_set):
      print(entity_relations_set.difference(entity_set))
  # NOTE(review): the original note here reads "comment on #any" -- presumably
  # about a '#any' placeholder entity reported by this check; confirm.
  # %%
  # RDF Templates
  # Skeleton RDF file, read from DATA_FOLDER; create_rdf swaps the placeholder
  # blocks it contains for the generated snippets.
  RDF_MAIN_TEMPLATE = 'template.rdf'
  # Explicit encoding so the result does not depend on the platform default.
  # NOTE(review): assumes template.rdf is stored as UTF-8 -- confirm.
  with open(DATA_FOLDER + RDF_MAIN_TEMPLATE, 'r', encoding='utf-8') as in_file:
      RAW_RDF = in_file.read()
  # RDF snippets; info will replace placeholder tags (in uppercase between '#')
  #
  # Optional lines. create_rdf deletes them from a rendered snippet with
  # str.replace when the matching value is missing, so each one has to occur
  # verbatim inside its template. The templates below are therefore built
  # from these constants instead of repeating the text.
  #
  # NOTE(review): the whitespace inside these literals could not be recovered
  # from the reviewed copy of the file. The optional lines keep the single
  # leading space visible there; every other line has none. create_rdf also
  # searches RAW_RDF for ENTITY_TEMPLATE, DATATYPE_PROPERTY_TEMPLATE and
  # OBJECT_PROPERTY_INVERSE_TEMPLATE verbatim, so their whitespace must match
  # template.rdf byte for byte -- confirm against that file.
  SUBCLASS_STRING = " <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>\n"
  CLASS_DEFINED_STRING = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
  OBJECT_DEFINED_STRING = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
  DATATYPE_DEFINED_STRING = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
  # One owl:Class per entity.
  ENTITY_TEMPLATE = (
      '\n'
      '<!-- http://www.h2iosc.it/onto##NAME# -->\n'
      '<owl:Class rdf:about="&h2iosc;#NAME#">\n'
      '<rdfs:label>#LABEL#</rdfs:label>\n'
      + SUBCLASS_STRING
      + CLASS_DEFINED_STRING
      + '</owl:Class>\n'
  )
  # One owl:ObjectProperty per relation (forward direction).
  OBJECT_PROPERTY_TEMPLATE = (
      '\n'
      '<!-- http://www.h2iosc.it/onto##NAME# -->\n'
      '<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">\n'
      '<rdfs:label>#LABEL#</rdfs:label>\n'
      '<rdfs:range rdf:resource="&h2iosc;#RANGE#"/>\n'
      '<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>\n'
      + OBJECT_DEFINED_STRING
      + '</owl:ObjectProperty>\n'
  )
  # Same, for the inverse direction; #INV# names the forward property.
  OBJECT_PROPERTY_INVERSE_TEMPLATE = (
      '\n'
      '<!-- http://www.h2iosc.it/onto##NAME# -->\n'
      '<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">\n'
      '<rdfs:label>#LABEL#</rdfs:label>\n'
      '<owl:inverseOf rdf:resource="&h2iosc;#INV#"/>\n'
      '<rdfs:range rdf:resource="&h2iosc;#RANGE#"/>\n'
      '<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>\n'
      + OBJECT_DEFINED_STRING
      + '</owl:ObjectProperty>\n'
  )
  # One owl:DatatypeProperty per entity attribute.
  # NOTE(review): #URI# appears twice here -- as an rdf:isDefinedBy attribute
  # on the opening tag and in the rdfs:isDefinedBy child. The attribute looks
  # like a leftover, but it is kept because this text has to match the
  # placeholder block in template.rdf.
  DATATYPE_PROPERTY_TEMPLATE = (
      '\n'
      '<!-- http://www.h2iosc.it/onto##NAME# -->\n'
      '<owl:DatatypeProperty rdf:about="&h2iosc;#NAME#" rdf:isDefinedBy="#URI#">\n'
      '<rdfs:label>#LABEL#</rdfs:label>\n'
      '<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>\n'
      + DATATYPE_DEFINED_STRING
      + '</owl:DatatypeProperty>\n'
  )
  # Utility
  def normalize_label(label):
      """Turn a human-readable label into a lowercase, accent-free identifier.

      The label is lowercased, spaces become underscores and diacritics are
      stripped from every letter ('à' -> 'a', 'é' -> 'e', 'ñ' -> 'n', ...),
      not only from a fixed list of Italian vowels.

      Args:
          label: The label text (str).

      Returns:
          The normalised string; '' for an empty label.
      """
      lowered = label.lower().replace(' ', '_')
      # NFD splits each accented letter into base letter + combining mark(s);
      # dropping the marks leaves the plain letter.
      decomposed = unicodedata.normalize('NFD', lowered)
      stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
      # Recompose whatever NFD split apart that was not an accent.
      return unicodedata.normalize('NFC', stripped)
  # %%
  # CREATE RDF OUTPUT
  def create_rdf(data):
      """Render the nested ontology structure as RDF/XML text.

      Every entity becomes an ENTITY_TEMPLATE snippet, every attribute a
      DATATYPE_PROPERTY_TEMPLATE snippet and every relation an
      OBJECT_PROPERTY_TEMPLATE snippet, followed by an
      OBJECT_PROPERTY_INVERSE_TEMPLATE snippet when the relation names an
      inverse. Each subclass link is additionally rendered as an
      is_subclass_of / is_superclass_of object-property pair.

      Args:
          data: Mapping with an 'Entità' dict (label -> entity record) and a
              'Relazioni' list of relation records, as built by
              dict_lists_to_json. It is not modified.

      Returns:
          RAW_RDF with its ENTITY_TEMPLATE, DATATYPE_PROPERTY_TEMPLATE and
          OBJECT_PROPERTY_INVERSE_TEMPLATE placeholder blocks replaced by the
          generated snippets (snippets are joined by three newlines).

      Raises:
          KeyError: if a relation lacks 'Etichetta', 'Entità 1' or 'Entità 2',
              or an attribute lacks 'Nome'.
      """
      entities_rdf_list = []
      datatype_properties_rdf_list = []
      # Local copy: the subclass pseudo-relations added below must not leak
      # into the caller's data (a second call would render them twice).
      relations = list(data['Relazioni'])
      for label, ent in data['Entità'].items():
          entity_name = normalize_label(label)
          entity_rdf = ENTITY_TEMPLATE.replace('#LABEL#', label).replace('#NAME#', entity_name)
          #
          cidoc_class = ent.get('Classe CIDOC proposta')
          if cidoc_class:
              entity_rdf = entity_rdf.replace('#URI#', cidoc_class)
          else:
              entity_rdf = entity_rdf.replace(CLASS_DEFINED_STRING, '')
          # Subclasses (an empty parent is treated like no parent at all)
          parent = ent.get('Sottoclasse di')
          if parent:
              relations.append({"Entità 1": label,
                                "Entità 2": parent,
                                "Etichetta": "is_subclass_of", "Inversa": "is_superclass_of"})
              entity_rdf = entity_rdf.replace('#PARENT#', normalize_label(parent))
          else:
              entity_rdf = entity_rdf.replace(SUBCLASS_STRING, '')
          entities_rdf_list.append(entity_rdf)
          for datatype in ent.get('Attributi') or []:
              datatype_label = datatype['Nome']
              datatype_name = normalize_label(datatype_label)
              #
              datatype_rdf = DATATYPE_PROPERTY_TEMPLATE.replace('#LABEL#', datatype_label).replace('#NAME#', datatype_name).replace('#DOMAIN#', entity_name)
              #
              datatype_cidoc_class = datatype.get('Classe CIDOC proposta')
              if datatype_cidoc_class:
                  datatype_rdf = datatype_rdf.replace('#URI#', datatype_cidoc_class)
              else:
                  # The datatype template carries #URI# twice: drop the child
                  # line and the attribute on the opening tag, so that no
                  # literal '#URI#' is left in the output.
                  datatype_rdf = datatype_rdf.replace(DATATYPE_DEFINED_STRING, '').replace(' rdf:isDefinedBy="#URI#"', '')
              #
              datatype_properties_rdf_list.append(datatype_rdf)
      relations_rdf_list = []
      for rel in relations:
          label = rel['Etichetta']
          inverse_label = rel.get('Inversa')
          domain = normalize_label(rel['Entità 1'])
          range1 = normalize_label(rel['Entità 2'])
          name = domain + '_' + normalize_label(label) + '_' + range1
          #
          relation_rdf = OBJECT_PROPERTY_TEMPLATE.replace('#NAME#', name).replace('#LABEL#', label).replace('#DOMAIN#', domain).replace('#RANGE#', range1)
          # dict_lists_to_json stores the link under 'Proprietà CIDOC proposta';
          # the raw column label is still accepted for records built by hand.
          relation_cidoc_class = rel.get('Proprietà CIDOC proposta') or rel.get(CIDOC_COLUMN_LABEL)
          if relation_cidoc_class:
              relation_rdf = relation_rdf.replace('#URI#', relation_cidoc_class)
          else:
              relation_rdf = relation_rdf.replace(OBJECT_DEFINED_STRING, '')
          if not inverse_label:
              # No inverse given: emit the forward property only, rather than
              # an inverse property with an empty label and name.
              relations_rdf_list.append(relation_rdf)
              continue
          inverse_name = range1 + '_' + normalize_label(inverse_label) + '_' + domain
          # The inverse property never carries an isDefinedBy link.
          relation_inverse_rdf = OBJECT_PROPERTY_INVERSE_TEMPLATE.replace('#NAME#', inverse_name).replace('#LABEL#', inverse_label).replace('#DOMAIN#', range1).replace('#RANGE#', domain).replace('#INV#', name).replace(OBJECT_DEFINED_STRING, '')
          #
          relations_rdf_list.append(relation_rdf + '\n\n\n' + relation_inverse_rdf)
      # Forward and inverse properties are both inserted where the inverse
      # template block appears in RAW_RDF.
      # NOTE(review): a forward OBJECT_PROPERTY_TEMPLATE block in RAW_RDF, if
      # there is one, is left untouched -- confirm against template.rdf.
      to_out = RAW_RDF.replace(
          ENTITY_TEMPLATE, '\n\n\n'.join(entities_rdf_list)
      ).replace(
          DATATYPE_PROPERTY_TEMPLATE, '\n\n\n'.join(datatype_properties_rdf_list)
      ).replace(
          OBJECT_PROPERTY_INVERSE_TEMPLATE, '\n\n\n'.join(relations_rdf_list)
      )
      return to_out
  # %%
  rdf_data = create_rdf(json_data)
  # Export
  # The labels written into the RDF contain accented characters, so the
  # encoding is fixed to UTF-8 instead of being left to the platform default.
  with open(OUTPUT_FOLDER + ONTO_FILENAME + '.rdf', 'w', encoding='utf-8') as out_file:
      out_file.write(rdf_data)
  # %%
  # WebVOWL -- presumably used to visualise the generated .rdf file:
  # https://service.tib.eu/webvowl/
  # %%