reparsing_2_TO_REVIEW.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. # %%
  2. import json
  3. import csv
  4. # IMPORT MASTER FILE
  5. with open('.dat/man_draft.json', 'r') as in_file:
  6. data = json.load(in_file)
  7. with open('entities_in.csv', 'r') as csv_in_1:
  8. reader = csv.DictReader(csv_in_1)
  9. ent_csv_in = [row for row in reader]
  10. with open('relations_in.csv', 'r') as csv_in_2:
  11. reader = csv.DictReader(csv_in_2)
  12. rel_csv_in = [row for row in reader]
  13. # %%
  14. # Consistency check
  15. entity_rels = {ent for rel in data['Relazioni'] for ent in [rel['Entità 1'], rel['Entità 2']]}
  16. entities = set(data['Entità'].keys())
  17. entities.add('#any') # For compatibility
  18. if not entity_rels.issubset(entities):
  19. print(entity_rels.difference(entities))
  20. # %%
  21. # USE A DIRTY SHORTCUT: paste entity/relation info on a precompiled rdf template file.
  22. # Load template
  23. with open('template_2.rdf', 'r') as in_file:
  24. raw_rdf = in_file.read()
# Defined rdf snippets; info will replace placeholder tags (in uppercase between '#')
# NOTE(review): the export step near the end of this script looks up
# entity_template, datatype_property_template and
# object_property_inverse_template inside the template file text with
# str.replace, so these literals presumably have to match 'template_2.rdf'
# character for character, whitespace included - verify before editing them.

# Snippet for one owl:Class. Placeholders: #NAME#, #LABEL#, #PARENT#, #URI#.
entity_template = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:Class rdf:about="&h2iosc;#NAME#">
<rdfs:label>#LABEL#</rdfs:label>
<rdfs:subClassOf>#PARENT#</rdfs:subClassOf>
<rdfs:isDefinedBy rdf:resource="#URI#"/>
</owl:Class>
'''
# Line removed from a rendered class when the entity has no 'Sottoclasse di'.
subclass_string = " <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>\n"
# Line removed from a rendered class when no CIDOC link is available.
class_defined_string = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
# Snippet for one owl:ObjectProperty (forward direction of a relation).
# Placeholders: #NAME#, #LABEL#, #RANGE#, #DOMAIN#, #URI#.
object_property_template = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
<rdfs:label>#LABEL#</rdfs:label>
<rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
<rdfs:isDefinedBy rdf:resource="#URI#"/>
</owl:ObjectProperty>
'''
# Same as above for the inverse direction; additionally carries #INV#, the
# name of the property it is the inverse of.
object_property_inverse_template = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
<rdfs:label>#LABEL#</rdfs:label>
<owl:inverseOf rdf:resource="&h2iosc;#INV#"/>
<rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
<rdfs:isDefinedBy rdf:resource="#URI#"/>
</owl:ObjectProperty>
'''
# Line removed from a rendered object property when no CIDOC link is available.
object_defined_string = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
# Snippet for one owl:DatatypeProperty. Placeholders: #NAME#, #LABEL#,
# #DOMAIN#, #URI#. Note that #URI# occurs twice here: as an attribute of the
# opening tag and in the rdfs:isDefinedBy child element.
datatype_property_template = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:DatatypeProperty rdf:about="&h2iosc;#NAME#" rdf:isDefinedBy="#URI#">
<rdfs:label>#LABEL#</rdfs:label>
<rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
<rdfs:isDefinedBy rdf:resource="#URI#"/>
</owl:DatatypeProperty>
'''
# Line removed from a rendered datatype property when no CIDOC link is available.
datatype_defined_string = ' <rdfs:isDefinedBy rdf:resource="#URI#"/>\n'
  65. # Define a normalization function for rdf labels for easier portability
  66. def label_to_name(label):
  67. return label.replace(' ', '_').replace('à', 'a').replace('è', 'e').replace('é', 'e').replace('ì', 'i').replace('ò', 'o').replace('ù', 'u')
# Generic ('proprietary') datatypes to std. xsd datatypes mapping
# Keys are the '#'-prefixed datatype tags; values are the xsd type names.
# '#uri' and '#coordinates' currently map to themselves (no xsd name given).
# NOTE(review): this mapping does not appear to be referenced anywhere in the
# visible part of the script - confirm whether it is still needed.
datatype_xsd = {
"#string": 'string',
'#uri': '#uri',
'#number': 'decimal',
'#date': 'date',
'#coordinates': '#coordinates'
}
  76. # %%
  77. # Map entity info to dedicated lists
  78. entities_rdf_list = []
  79. entities_csv = []
  80. datatype_properties_rdf_list = []
  81. same_as = list(data['Same_as'].keys())
  82. for label, ent in data['Entità'].items():
  83. entity_name = label_to_name(label)
  84. entity_rdf = entity_template.replace('#LABEL#', label).replace('#NAME#', entity_name)
  85. # Try to find entity in extra csv, see if there is CIDOC info and if so, map it
  86. entity_in_csv = next((ent for ent in ent_csv_in if ent['ENTITÀ']==label), None)
  87. cidoc_class = None
  88. if entity_in_csv is not None:
  89. cidoc_class = entity_in_csv.get('CIDOC-LINK')
  90. if cidoc_class is not None and cidoc_class!='':
  91. entity_rdf = entity_rdf.replace('#URI#', cidoc_class)
  92. else:
  93. entity_rdf = entity_rdf.replace(class_defined_string, '')
  94. # Subclasses
  95. if 'Sottoclasse di' in ent.keys():
  96. parent = ent['Sottoclasse di']
  97. data['Relazioni'].append({"Entità 1": label,
  98. "Entità 2": parent,
  99. "Etichetta": "is_subclass_of", "Inversa": "is_superclass_of"})
  100. entity_rdf = entity_rdf.replace('#PARENT#', label_to_name(parent))
  101. else:
  102. entity_rdf = entity_rdf.replace(subclass_string, '')
  103. entities_rdf_list.append(entity_rdf)
  104. #
  105. if label in same_as:
  106. entities_csv.append( [label, "", ', '.join(data['Same_as'][label])] )
  107. else:
  108. entities_csv.append([label, "", ""])
  109. for datatype_label, datatype_val in ent.items():
  110. if not isinstance(datatype_val, str) or not datatype_val.startswith('#'):
  111. continue
  112. entities_csv.append(["", datatype_label, ""])
  113. datatype_name = label_to_name(datatype_label)
  114. datatype_rdf = datatype_property_template.replace('#LABEL#', datatype_label).replace('#NAME#', datatype_name).replace('#DOMAIN#', entity_name)
  115. # Try to find entity in extra csv, see if there is CIDOC info and if so, map it
  116. datatype_in_csv = next((ent for ent in ent_csv_in if ent['ATTRIBUTO (LITERAL)']==datatype_label), None)
  117. cidoc_class = None
  118. if datatype_in_csv is not None:
  119. cidoc_class = datatype_in_csv.get('CIDOC-LINK')
  120. if cidoc_class is not None and cidoc_class!='':
  121. datatype_rdf = datatype_rdf.replace('#URI#', cidoc_class)
  122. else:
  123. datatype_rdf = datatype_rdf.replace(datatype_defined_string, '')
  124. datatype_properties_rdf_list.append(datatype_rdf)
  125. # Map relation info to dedicated lists
  126. relations_rdf_list = []
  127. relations_csv = []
  128. for rel in data['Relazioni']:
  129. label = rel['Etichetta']
  130. inverse_label = rel['Inversa']
  131. domain = label_to_name(rel['Entità 1'])
  132. range1 = label_to_name(rel['Entità 2'])
  133. relations_csv.append([rel['Entità 1'], rel['Entità 2'], rel['Etichetta'], rel['Inversa']])
  134. name = domain + '_' + label_to_name(label) + '_' + range1
  135. inverse_name = range1 + '_' + label_to_name(inverse_label) + '_' + domain
  136. # Try to find entity in extra csv, see if there is CIDOC info and if so, map it
  137. relation_in_csv = next((rel_csv for rel_csv in rel_csv_in if (rel_csv['ENTITÀ 1']==rel['Entità 1'] and rel_csv['ENTITÀ 2']==rel['Entità 2']) ), None)
  138. cidoc_class = None
  139. #
  140. relation_rdf = object_property_template.replace('#NAME#', name).replace('#LABEL#', label).replace('#DOMAIN#', domain).replace('#RANGE#', range1)
  141. #
  142. if relation_in_csv is not None:
  143. cidoc_class = relation_in_csv.get('CIDOC-LINK')
  144. if cidoc_class is not None and cidoc_class!='':
  145. relation_rdf = relation_rdf.replace('#URI#', cidoc_class)
  146. else:
  147. relation_rdf = relation_rdf.replace(object_defined_string, '')
  148. #
  149. relation_inverse_rdf = object_property_inverse_template.replace('#NAME#', inverse_name).replace('#LABEL#', inverse_label).replace('#DOMAIN#', range1).replace('#RANGE#', domain).replace('#INV#', name)
  150. #
  151. if cidoc_class is not None and cidoc_class!='':
  152. relation_inverse_rdf = relation_inverse_rdf.replace('#URI#', cidoc_class)
  153. else:
  154. relation_inverse_rdf = relation_inverse_rdf.replace('<rdfs:isDefinedBy rdf:resource="#URI#"/>', '')
  155. #
  156. relation_full_rdf = relation_rdf + '\n\n\n' + relation_inverse_rdf
  157. relations_rdf_list.append(relation_full_rdf)
  158. # Write info to template and export it to output file
  159. with open('draft.rdf', 'w') as out_file:
  160. to_out = raw_rdf.replace(entity_template, '\n\n\n'.join(entities_rdf_list)).replace(datatype_property_template, '\n\n\n'.join(datatype_properties_rdf_list)).replace(object_property_inverse_template, '\n\n\n'.join(relations_rdf_list))
  161. out_file.write(to_out)
  162. # %%
  163. # Write info to two csv files (one for Entities, one for Relations) for extra human readability
  164. with open('entities.csv', 'w') as out_csv:
  165. writer = csv.writer(out_csv)
  166. writer.writerow(['ENTITÀ', 'ATTRIBUTO (LITERAL)', 'SAME AS'])
  167. writer.writerows(entities_csv)
  168. with open('relations.csv', 'w') as out_csv:
  169. writer = csv.writer(out_csv)
  170. writer.writerow(['ENTITÀ 1', 'ENTITÀ 2', 'NOME RELAZIONE', 'NOME RELAZIONE INVERSA'])
  171. writer.writerows(relations_csv)
  172. # %%
# Inspection cell: dump the raw template text that was loaded.
print(raw_rdf)
# %%
# Inspection cell: True when the class snippet occurs verbatim in the template.
# As a bare expression its value is discarded when the file runs as a plain
# script; presumably it is meant to be looked at in an interactive session.
entity_template in raw_rdf
# %%
# Inspection cell: show the class snippet itself (bare expression, see above).
entity_template
  178. # %%