parse_onto_xlsx.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. # %%
  2. import csv
  3. import json
  4. # CONFIGURATION
  5. DATA_FOLDER = './data/'
  6. XLSX_FILENAME = 'man_draft.xlsx'
  7. ENTITIES_SHEETNAME = 'Entità'
  8. RELATIONS_SHEETNAME = 'Relazioni'
  9. #
  10. HEADER_ROW = True
  11. #
  12. ENTITIES_COLUMN_LABEL = 'ENTITÀ'
  13. ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
  14. SAMEAS_COLUMN_LABEL = 'SAME AS'
  15. #
  16. RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
  17. RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
  18. RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
  19. INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
  20. # %%
  21. # Import xlsx through openpyxl
  22. import openpyxl as op
  23. input_data = op.load_workbook(DATA_FOLDER + XLSX_FILENAME)
  24. entities_sheet = input_data[ENTITIES_SHEETNAME]
  25. relations_sheet = input_data[RELATIONS_SHEETNAME]
  26. # %%
  27. # Conversion utility: from openpyxl object to csv-style list of dicts
  28. def sheet_to_dict_list(openpyxl_sheet, keys, header_row_local=True):
  29. if header_row_local:
  30. headers = next(openpyxl_sheet.values)
  31. indices = {key: headers.index(key) for key in keys}
  32. else:
  33. indices = {key: int(key) for key in keys}
  34. output = []
  35. for sheet_row in openpyxl_sheet.values:
  36. # Get row data as dict
  37. out_row = {key: sheet_row[indices[key]]for key in keys if indices[key]>-1}
  38. output.append(out_row)
  39. if header_row_local:
  40. output = output[1:]
  41. return output
  42. # %%
  43. # Define csv-style list of dicts for entities and relations
  44. # Collect headers (column names) for the Entities Sheet
  45. ent_keys = [ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL]
  46. # Convert
  47. entities = sheet_to_dict_list(entities_sheet, ent_keys, HEADER_ROW)
  48. # Export
  49. basename = XLSX_FILENAME[:XLSX_FILENAME.rfind('.')]
  50. ent_filename = basename + '_entities.csv'
  51. with open(DATA_FOLDER + ent_filename, 'w') as out_file:
  52. writer = csv.DictWriter(out_file, ent_keys)
  53. writer.writeheader()
  54. writer.writerows(entities)
  55. # Collect headers (column names) for the Relations Sheet
  56. rel_keys = [RELATION_FIRST_COLUMN_LABEL, RELATION_SECOND_COLUMN_LABEL, RELATION_NAME_COLUMN_LABEL,INVERSE_RELATION_COLUMN_LABEL]
  57. # Convert
  58. relations = sheet_to_dict_list(relations_sheet, rel_keys, HEADER_ROW)
  59. # Export
  60. rel_filename = basename + '_relations.csv'
  61. with open(DATA_FOLDER + rel_filename, 'w') as out_file:
  62. writer = csv.DictWriter(out_file, rel_keys)
  63. writer.writeheader()
  64. writer.writerows(relations)
  65. # %%
  66. # From here on, work with the 'entities' and 'relations' lists of dicts
  67. def dict_lists_to_json(entities_local, relations_local):
  68. entity = {}
  69. same_as = {}
  70. current_entity = None
  71. for row in entities_local:
  72. entity_name = row.get(ENTITIES_COLUMN_LABEL)
  73. attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
  74. same_as_row = row.get(SAMEAS_COLUMN_LABEL)
  75. same_as_list = same_as_row.split(',') if same_as_row else []
  76. if entity_name:
  77. current_entity = entity_name
  78. entity[current_entity] = {}
  79. if current_entity and attribute_name:
  80. if not entity[current_entity].get('Attributi'):
  81. entity[current_entity]['Attributi'] = []
  82. entity[current_entity]['Attributi'].append(attribute_name)
  83. if current_entity and same_as_list:
  84. same_as[current_entity] = [s.strip() for s in same_as_list]
  85. # Add subclass information
  86. for row in relations_local:
  87. entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
  88. entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
  89. label = row.get(RELATION_NAME_COLUMN_LABEL)
  90. if label == "is_subclass_of":
  91. if entity1 in entity:
  92. entity[entity1]["Sottoclasse di"] = entity2
  93. # Construct relations
  94. entity_relations = []
  95. for row in relations_local:
  96. if row['NOME RELAZIONE'] != "is_subclass_of":
  97. relation = {
  98. "Entità 1": row['ENTITÀ 1'],
  99. "Entità 2": row['ENTITÀ 2'],
  100. "Etichetta": row['NOME RELAZIONE'],
  101. "Inversa": row['NOME RELAZIONE INVERSA']
  102. }
  103. entity_relations.append(relation)
  104. # Create final JSON structure
  105. data = {
  106. "Entità": entity,
  107. "Relazioni": entity_relations,
  108. "Same_as": same_as
  109. }
  110. return data
  111. # %%
  112. json_data = dict_lists_to_json(entities, relations)
  113. with open(DATA_FOLDER + basename + '.json', 'w') as out_json:
  114. json.dump(json_data, out_json, indent=2, ensure_ascii=False)
  115. # %%