123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- # %%
- import csv
- import json
# CONFIGURATION

# Folder holding the input workbook; the generated CSV/JSON files are
# written to the same folder.
DATA_FOLDER = './data/'
XLSX_FILENAME = 'man_draft.xlsx'

# Names of the two worksheets read from the workbook.
ENTITIES_SHEETNAME = 'Entità'
RELATIONS_SHEETNAME = 'Relazioni'

# True when the first row of each sheet holds the column labels below.
HEADER_ROW = True

# Column labels of the entities sheet.
ENTITIES_COLUMN_LABEL = 'ENTITÀ'
ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
SAMEAS_COLUMN_LABEL = 'SAME AS'

# Column labels of the relations sheet.
RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
- # %%
- # Import xlsx through openpyxl
- import openpyxl as op
# Open the configured workbook, then pick out the entities sheet and the
# relations sheet by name.
input_data = op.load_workbook(f"{DATA_FOLDER}{XLSX_FILENAME}")
entities_sheet, relations_sheet = (
    input_data[ENTITIES_SHEETNAME],
    input_data[RELATIONS_SHEETNAME],
)
- # %%
- # Conversion utility: from openpyxl object to csv-style list of dicts
def sheet_to_dict_list(openpyxl_sheet, keys, header_row_local=True):
    """Convert a worksheet into a csv-style list of dicts.

    Args:
        openpyxl_sheet: Object whose ``values`` attribute yields one tuple of
            cell values per row (as an openpyxl worksheet does).
        keys: Columns to extract. With a header row these are the header
            labels; without one they are column positions (anything ``int()``
            accepts). A negative position marks a column to leave out.
        header_row_local: Whether the first row holds the column labels. The
            header row itself is never part of the output.

    Returns:
        One dict per data row, mapping each requested key to its cell value.

    Raises:
        ValueError: If a header row is expected and a requested label is not
            in it (an empty sheet counts as having no labels), or if a key
            cannot be converted to a position when there is no header row.
    """
    # Walk the sheet once: the header (if any) is consumed from the same
    # iterator that then yields the data rows.
    rows = iter(openpyxl_sheet.values)
    if header_row_local:
        # Default to an empty header so an empty sheet is reported like any
        # other missing-column problem instead of leaking StopIteration.
        headers = next(rows, ())
        missing = [key for key in keys if key not in headers]
        if missing:
            raise ValueError(
                f"Column header(s) not found in sheet: {missing}"
            )
        indices = {key: headers.index(key) for key in keys}
    else:
        indices = {key: int(key) for key in keys}
    # Negative positions mean "skip this column".
    wanted = {key: index for key, index in indices.items() if index > -1}
    return [
        {key: sheet_row[index] for key, index in wanted.items()}
        for sheet_row in rows
    ]
- # %%
# Define csv-style lists of dicts for entities and relations, and export
# each of them as a CSV file next to the input workbook.

# Column names to extract from the Entities sheet
ent_keys = [ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL]
# Convert
entities = sheet_to_dict_list(entities_sheet, ent_keys, HEADER_ROW)
# Export
# rsplit keeps the whole name when it has no extension; slicing at
# rfind('.') would then be slicing at -1 and drop the last character.
basename = XLSX_FILENAME.rsplit('.', 1)[0]
ent_filename = basename + '_entities.csv'
# newline='' is required by the csv module so that it controls line endings
# itself; UTF-8 keeps the accented labels and values independent of the
# platform's default encoding.
with open(DATA_FOLDER + ent_filename, 'w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, ent_keys)
    writer.writeheader()
    writer.writerows(entities)

# Column names to extract from the Relations sheet
rel_keys = [
    RELATION_FIRST_COLUMN_LABEL,
    RELATION_SECOND_COLUMN_LABEL,
    RELATION_NAME_COLUMN_LABEL,
    INVERSE_RELATION_COLUMN_LABEL,
]
# Convert
relations = sheet_to_dict_list(relations_sheet, rel_keys, HEADER_ROW)
# Export (same file settings as for the entities file above)
rel_filename = basename + '_relations.csv'
with open(DATA_FOLDER + rel_filename, 'w', newline='', encoding='utf-8') as out_file:
    writer = csv.DictWriter(out_file, rel_keys)
    writer.writeheader()
    writer.writerows(relations)
- # %%
- # From here on, work with the 'entities' and 'relations' lists of dicts
def dict_lists_to_json(entities_local, relations_local):
    """Build the JSON-ready structure from the entity and relation rows.

    Args:
        entities_local: Rows (dicts) keyed by the entity, attribute and
            same-as column labels from the configuration. A row with a filled
            entity cell starts a new entity; following rows with an empty
            entity cell add to that entity.
        relations_local: Rows (dicts) keyed by the relation column labels
            from the configuration.

    Returns:
        A dict with three keys:
        ``"Entità"`` maps each entity name to a dict that may hold
        ``"Attributi"`` (list of attribute names) and ``"Sottoclasse di"``;
        ``"Relazioni"`` is the list of relations other than
        ``is_subclass_of``; ``"Same_as"`` maps entity names to their list of
        same-as values.
    """
    entity = {}
    same_as = {}
    current_entity = None
    for row in entities_local:
        entity_name = row.get(ENTITIES_COLUMN_LABEL)
        attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
        same_as_row = row.get(SAMEAS_COLUMN_LABEL)
        if entity_name:
            # A filled entity cell opens a (fresh) entry; rows below it with
            # an empty entity cell keep contributing to this entity.
            current_entity = entity_name
            entity[current_entity] = {}
        if not current_entity:
            # Nothing to attach attributes or same-as values to yet.
            continue
        if attribute_name:
            entity[current_entity].setdefault('Attributi', []).append(attribute_name)
        if same_as_row:
            # The same-as cell is a comma-separated list.
            same_as[current_entity] = [s.strip() for s in same_as_row.split(',')]

    # Single pass over the relations, always addressing the columns through
    # the configuration constants: "is_subclass_of" rows annotate the
    # entities, every other row becomes an entry of the relations list.
    entity_relations = []
    for row in relations_local:
        entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
        entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
        label = row.get(RELATION_NAME_COLUMN_LABEL)
        if label == "is_subclass_of":
            # Subclass information is only recorded for known entities.
            if entity1 in entity:
                entity[entity1]["Sottoclasse di"] = entity2
            continue
        entity_relations.append({
            "Entità 1": entity1,
            "Entità 2": entity2,
            "Etichetta": label,
            "Inversa": row.get(INVERSE_RELATION_COLUMN_LABEL),
        })

    # Create final JSON structure
    return {
        "Entità": entity,
        "Relazioni": entity_relations,
        "Same_as": same_as,
    }
- # %%
# Build the JSON structure and write it next to the input workbook.
json_data = dict_lists_to_json(entities, relations)
# ensure_ascii=False writes accented characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default.
with open(DATA_FOLDER + basename + '.json', 'w', encoding='utf-8') as out_json:
    json.dump(json_data, out_json, indent=2, ensure_ascii=False)
- # %%
|