ontology_parser.py

# %%
import csv
import json

# BASIC CONFIGURATION
DATA_FOLDER = './data/'
OUTPUT_FOLDER = './output/'
ONTO_FILENAME = 'manoscritti_dariah'  # No extension!

ent_filename = ONTO_FILENAME + '_entities.csv'
rel_filename = ONTO_FILENAME + '_relations.csv'
# %%
# PART II: collect csv data into a 'pre-ontology' structure
# Read csv files back in (or use them directly as starting points)
HEADER_ROW = True

# Not difficult to add more keys (column names)
ENTITIES_COLUMN_LABEL = 'ENTITÀ'
ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
SAMEAS_COLUMN_LABEL = 'SAME AS'
#
RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
#
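# For reference, the column layout assumed by the code below (taken from the
# labels above; whether a header row is present is controlled by HEADER_ROW):
#
#   <ONTO_FILENAME>_entities.csv  -> ENTITÀ | ATTRIBUTO (LITERAL) | SAME AS
#   <ONTO_FILENAME>_relations.csv -> ENTITÀ 1 | ENTITÀ 2 | NOME RELAZIONE | NOME RELAZIONE INVERSA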
with open(OUTPUT_FOLDER + ent_filename, 'r', encoding='utf-8') as in_file:
    if HEADER_ROW:
        reader = csv.DictReader(in_file)
    else:
        reader = csv.DictReader(in_file, fieldnames=[ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL])
    entities = [row for row in reader]

# Post-process: check & correct homonyms in attribute names
attribute_counts = {}
for ent in entities:
    if ent.get(ATTRIBUTES_COLUMN_LABEL):
        attribute = ent[ATTRIBUTES_COLUMN_LABEL]
        if not attribute_counts.get(attribute):
            attribute_counts[attribute] = 1
        else:
            attribute_counts[attribute] = attribute_counts[attribute] + 1
#
attribute_counts = {attr: count for attr, count in attribute_counts.items() if count > 1}
#
for ent in entities:
    if ent.get(ATTRIBUTES_COLUMN_LABEL):
        attribute = ent[ATTRIBUTES_COLUMN_LABEL]
        if attribute_counts.get(attribute):
            ent[ATTRIBUTES_COLUMN_LABEL] = attribute + '_' + ent[ENTITIES_COLUMN_LABEL]
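# Example of the renaming above (hypothetical labels, not taken from the data):
# if 'titolo' is listed as an attribute of both 'MANOSCRITTO' and 'OPERA', the two
# rows become 'titolo_MANOSCRITTO' and 'titolo_OPERA', so the datatype properties
# generated later no longer collide.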
# Only used for cross-check
with open(OUTPUT_FOLDER + ent_filename.replace('.csv', '_aux.csv'), 'w', encoding='utf-8', newline='\n') as out_file_aux:
    writer = csv.DictWriter(out_file_aux, fieldnames=[ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL])
    writer.writeheader()
    writer.writerows(entities)

with open(OUTPUT_FOLDER + rel_filename, 'r', encoding='utf-8') as in_file:
    if HEADER_ROW:
        reader = csv.DictReader(in_file)
    else:
        reader = csv.DictReader(in_file, fieldnames=[RELATION_FIRST_COLUMN_LABEL, RELATION_SECOND_COLUMN_LABEL, RELATION_NAME_COLUMN_LABEL, INVERSE_RELATION_COLUMN_LABEL])
    relations = [row for row in reader]
# %%
# From here on, work with the 'entities' and 'relations' lists of dicts.
# Arrange them in a nested structure, for convenience.
def dict_lists_to_json(entities_local, relations_local):
    entity = {}
    current_entity = None
    for row in entities_local:
        entity_name = row.get(ENTITIES_COLUMN_LABEL)
        attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
        same_as_row = row.get(SAMEAS_COLUMN_LABEL)
        same_as_list = same_as_row.split(',') if same_as_row else []
        if entity_name:
            current_entity = entity_name
            if not entity.get(current_entity):
                entity[current_entity] = {}
        if current_entity and attribute_name:
            if not entity[current_entity].get('Attributi'):
                entity[current_entity]['Attributi'] = []
            entity[current_entity]['Attributi'].append(attribute_name)
        if current_entity and same_as_list:
            entity[current_entity]['Sinonimi'] = [s.strip() for s in same_as_list]
    # Add subclass information
    for row in relations_local:
        entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
        entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
        label = row.get(RELATION_NAME_COLUMN_LABEL)
        if label == "is_subclass_of":
            if entity1 in entity:
                entity[entity1]["Sottoclasse di"] = entity2
    # Construct relations
    entity_relations = []
    for row in relations_local:
        if row[RELATION_NAME_COLUMN_LABEL] != "is_subclass_of":
            relation = {
                "Entità 1": row[RELATION_FIRST_COLUMN_LABEL],
                "Entità 2": row[RELATION_SECOND_COLUMN_LABEL],
                "Etichetta": row[RELATION_NAME_COLUMN_LABEL],
                "Inversa": row[INVERSE_RELATION_COLUMN_LABEL]
            }
            entity_relations.append(relation)
    # Create final JSON structure
    data = {
        "Entità": entity,
        "Relazioni": entity_relations
    }
    return data
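# For orientation, the returned structure has this shape (the placeholder names in
# angle brackets are illustrative, not real data):
#
# {
#     "Entità": {
#         "<entity label>": {
#             "Attributi": ["<attribute label>", ...],       # optional
#             "Sinonimi": ["<same-as value>", ...],          # optional
#             "Sottoclasse di": "<parent entity label>"      # optional
#         }
#     },
#     "Relazioni": [
#         {"Entità 1": "...", "Entità 2": "...", "Etichetta": "...", "Inversa": "..."}
#     ]
# }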
# %%
json_data = dict_lists_to_json(entities, relations)

# Export data
with open(OUTPUT_FOLDER + ONTO_FILENAME + '.json', 'w', encoding='utf-8') as out_json:
    json.dump(json_data, out_json, indent=2, ensure_ascii=False)

# %%
# Consistency check on the assembled data
entity_set = set(json_data['Entità'].keys())
entity_relations_set = {ent for rel in json_data['Relazioni'] for ent in [rel['Entità 1'], rel['Entità 2']]}
# The check: every entity mentioned in a relation must also be defined as an entity;
# print any that are not
if not entity_relations_set.issubset(entity_set):
    print(entity_relations_set.difference(entity_set))
# Note on #any
# %%
# RDF Templates
RDF_MAIN_TEMPLATE = 'template.rdf'
with open(DATA_FOLDER + RDF_MAIN_TEMPLATE, 'r', encoding='utf-8') as in_file:
    RAW_RDF = in_file.read()
# RDF snippets; info will replace the placeholder tags (in uppercase, between '#')
ENTITY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:Class rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>
</owl:Class>
'''
# Must match the subClassOf line of ENTITY_TEMPLATE exactly (including indentation),
# so that it can be stripped out when an entity has no parent
SUBCLASS_STRING = "    <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>\n"

OBJECT_PROPERTY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:ObjectProperty>
'''

OBJECT_PROPERTY_INVERSE_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <owl:inverseOf rdf:resource="&h2iosc;#INV#"/>
    <rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:ObjectProperty>
'''

DATATYPE_PROPERTY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:DatatypeProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:DatatypeProperty>
'''
# Utility
def normalize_label(label):
    # Lowercase, replace spaces with underscores, transliterate accented vowels
    return (label.lower()
            .replace(' ', '_')
            .replace('à', 'a').replace('è', 'e').replace('é', 'e')
            .replace('ì', 'i').replace('ò', 'o').replace('ù', 'u'))
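
# %%
# Quick sanity check for normalize_label; the sample strings are made up and the
# expected values simply follow the replacement rules above.
assert normalize_label('Unità di misura') == 'unita_di_misura'
assert normalize_label('Città') == 'citta'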
# %%
# CREATE RDF OUTPUT
def create_rdf(data):
    entities_rdf_list = []
    datatype_properties_rdf_list = []
    for label, ent in data['Entità'].items():
        entity_name = normalize_label(label)
        entity_rdf = ENTITY_TEMPLATE.replace('#LABEL#', label).replace('#NAME#', entity_name)
        # Subclasses
        if 'Sottoclasse di' in ent.keys():
            parent = ent['Sottoclasse di']
            data['Relazioni'].append({"Entità 1": label,
                                      "Entità 2": parent,
                                      "Etichetta": "is_subclass_of",
                                      "Inversa": "is_superclass_of"})
            entity_rdf = entity_rdf.replace('#PARENT#', normalize_label(parent))
        else:
            # No parent: drop the subClassOf line from the snippet
            entity_rdf = entity_rdf.replace(SUBCLASS_STRING, '')
        entities_rdf_list.append(entity_rdf)
        if not ent.get('Attributi'):
            continue
        for datatype_label in ent['Attributi']:
            datatype_name = normalize_label(datatype_label)
            datatype_properties_rdf_list.append(
                DATATYPE_PROPERTY_TEMPLATE.replace('#LABEL#', datatype_label)
                                          .replace('#NAME#', datatype_name)
                                          .replace('#DOMAIN#', entity_name)
            )
    relations_rdf_list = []
    for rel in data['Relazioni']:
        label = rel['Etichetta']
        inverse_label = rel['Inversa']
        domain = normalize_label(rel['Entità 1'])
        range1 = normalize_label(rel['Entità 2'])
        name = domain + '_' + normalize_label(label) + '_' + range1
        inverse_name = range1 + '_' + normalize_label(inverse_label) + '_' + domain
        #
        relation_rdf = (OBJECT_PROPERTY_TEMPLATE.replace('#NAME#', name)
                        .replace('#LABEL#', label)
                        .replace('#DOMAIN#', domain)
                        .replace('#RANGE#', range1))
        #
        relation_inverse_rdf = (OBJECT_PROPERTY_INVERSE_TEMPLATE.replace('#NAME#', inverse_name)
                                .replace('#LABEL#', inverse_label)
                                .replace('#DOMAIN#', range1)
                                .replace('#RANGE#', domain)
                                .replace('#INV#', name))
        #
        relation_full_rdf = relation_rdf + '\n\n\n' + relation_inverse_rdf
        relations_rdf_list.append(relation_full_rdf)
    # template.rdf is expected to contain the three snippet strings verbatim;
    # each occurrence acts as a placeholder and is replaced by the generated blocks
    to_out = (RAW_RDF.replace(ENTITY_TEMPLATE, '\n\n\n'.join(entities_rdf_list))
              .replace(DATATYPE_PROPERTY_TEMPLATE, '\n\n\n'.join(datatype_properties_rdf_list))
              .replace(OBJECT_PROPERTY_INVERSE_TEMPLATE, '\n\n\n'.join(relations_rdf_list)))
    return to_out
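
# For illustration only (hypothetical labels, not taken from the real data): an
# entity labelled 'Manoscritto' with parent 'Documento' would be rendered roughly as
#
#   <!-- http://www.h2iosc.it/onto#manoscritto -->
#   <owl:Class rdf:about="&h2iosc;manoscritto">
#       <rdfs:label>Manoscritto</rdfs:label>
#       <rdfs:subClassOf>documento</rdfs:subClassOf>
#   </owl:Class>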
# %%
rdf_data = create_rdf(json_data)

# Export
with open(OUTPUT_FOLDER + ONTO_FILENAME + '.rdf', 'w', encoding='utf-8') as out_file:
    out_file.write(rdf_data)

# %%
# An easy way to visualize the ontology: upload it to
#
# https://service.tib.eu/webvowl/
# %%