# %%
"""Turn an ontology draft spreadsheet into a JSON structure and an RDF file.

Steps, in order: dump two sheets of an xlsx workbook to CSV, read the CSV
rows back, nest them into a 'pre-ontology' dict, export it as JSON, run a
consistency check, and load the RDF template used by ``create_rdf``.
All file I/O runs only when the file is executed as a script or as
interactive cells (``__name__ == "__main__"``), not on import.
"""
import csv
import json

# BASIC CONFIGURATION
DATA_FOLDER = './data/'
OUTPUT_FOLDER = './output/'
ONTO_FILENAME = 'man_draft_CIDOC'  # No extension!

ent_filename = ONTO_FILENAME + '_entities.csv'
rel_filename = ONTO_FILENAME + '_relations.csv'

# %%
# PART I: parse xlsx to (multiple) csv

# CONFIGURATION
XLSX_FILENAME = ONTO_FILENAME + '.xlsx'
ENTITIES_SHEETNAME = 'Entità'
RELATIONS_SHEETNAME = 'Relazioni'


def _write_sheet_csv(sheet, path):
    """Write every row of worksheet *sheet* (its ``.values``) to *path* as CSV."""
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', encoding='utf-8', newline='') as out_file:
        csv.writer(out_file).writerows(sheet.values)


# %%
# PART II: collect csv data into a 'pre-ontology' structure

# Set to False when the CSV files have no header row; the column labels
# below are then used as field names, in the order listed.
HEADER_ROW = True

# Not difficult to add more keys (column names)
ENTITIES_COLUMN_LABEL = 'ENTITÀ'
ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
SAMEAS_COLUMN_LABEL = 'SAME AS'
#
RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
#
CIDOC_COLUMN_LABEL = 'CIDOC-LINK'

ENTITY_FIELDNAMES = [
    ENTITIES_COLUMN_LABEL,
    ATTRIBUTES_COLUMN_LABEL,
    SAMEAS_COLUMN_LABEL,
    CIDOC_COLUMN_LABEL,
]
RELATION_FIELDNAMES = [
    RELATION_FIRST_COLUMN_LABEL,
    RELATION_SECOND_COLUMN_LABEL,
    RELATION_NAME_COLUMN_LABEL,
    INVERSE_RELATION_COLUMN_LABEL,
    CIDOC_COLUMN_LABEL,
]

# Key under which a relation's proposed CIDOC property is stored in the
# nested structure; shared by dict_lists_to_json and create_rdf.
RELATION_CIDOC_KEY = 'Proprietà CIDOC proposta'


def _read_csv_dicts(path, fieldnames):
    """Return the rows of the CSV file at *path* as a list of dicts.

    With HEADER_ROW set, the first row provides the keys; otherwise
    *fieldnames* does.
    """
    with open(path, 'r', encoding='utf-8', newline='') as in_file:
        reader = csv.DictReader(
            in_file, fieldnames=None if HEADER_ROW else fieldnames
        )
        return list(reader)


# %%
# From here on, work with the 'entities' and 'relations' lists of dicts.
# Arrange them in a nested structure, for convenience
def dict_lists_to_json(entities_local, relations_local):
    """Nest entity and relation rows into one JSON-serialisable dict.

    Args:
        entities_local: row dicts keyed by the entity column labels. A row
            with a non-empty entity name starts a new entity; rows without
            one add attributes/synonyms to the most recent entity.
        relations_local: row dicts keyed by the relation column labels.

    Returns:
        ``{"Entità": {...}, "Relazioni": [...]}``. Rows named
        ``is_subclass_of`` are not listed under "Relazioni"; they set
        "Sottoclasse di" on the first entity instead (and are ignored when
        that entity is unknown).

    Raises:
        KeyError: if a non-subclass relation row lacks one of the entity,
            name or inverse-name columns.
    """
    entity = {}
    current_entity = None
    for row in entities_local:
        entity_name = row.get(ENTITIES_COLUMN_LABEL)
        attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
        same_as_row = row.get(SAMEAS_COLUMN_LABEL)
        cidoc_link = row.get(CIDOC_COLUMN_LABEL)

        if entity_name:
            current_entity = entity_name
            entity[current_entity] = {}
            if cidoc_link:
                entity[current_entity]['Classe CIDOC proposta'] = cidoc_link

        if not current_entity:
            # Attribute/synonym data before the first entity has no owner.
            continue

        if attribute_name:
            attribute = {'Nome': attribute_name}
            if cidoc_link:
                attribute['Classe CIDOC proposta'] = cidoc_link
            entity[current_entity].setdefault('Attributi', []).append(attribute)

        if same_as_row:
            # Drop empty fragments produced by stray or trailing commas.
            synonyms = [s.strip() for s in same_as_row.split(',') if s.strip()]
            if synonyms:
                entity[current_entity]['Sinonimi'] = synonyms

    entity_relations = []
    for row in relations_local:
        if row.get(RELATION_NAME_COLUMN_LABEL) == "is_subclass_of":
            # Subclass information is attached to the entity itself.
            child = row.get(RELATION_FIRST_COLUMN_LABEL)
            if child in entity:
                entity[child]["Sottoclasse di"] = row.get(RELATION_SECOND_COLUMN_LABEL)
            continue
        entity_relations.append({
            "Entità 1": row[RELATION_FIRST_COLUMN_LABEL],
            "Entità 2": row[RELATION_SECOND_COLUMN_LABEL],
            "Etichetta": row[RELATION_NAME_COLUMN_LABEL],
            "Inversa": row[INVERSE_RELATION_COLUMN_LABEL],
            RELATION_CIDOC_KEY: row.get(CIDOC_COLUMN_LABEL),
        })

    # Final JSON structure
    return {
        "Entità": entity,
        "Relazioni": entity_relations,
    }


# %%
# RDF Templates
RDF_MAIN_TEMPLATE = 'template.rdf'

# RDF snippets; info will replace placeholder tags (in uppercase between '#')
# NOTE(review): these literals look like they lost their XML markup (and
# their original line breaks) somewhere upstream - e.g. '#NAME#', '#URI#',
# '#DOMAIN#', '#RANGE#' and '#INV#' are substituted below but appear in no
# snippet. They are reproduced as found; restore them from the original file.
ENTITY_TEMPLATE = ''' #LABEL# #PARENT# '''
SUBCLASS_STRING = " #PARENT#\n"
CLASS_DEFINED_STRING = ' \n'
OBJECT_PROPERTY_TEMPLATE = ''' #LABEL# '''
OBJECT_PROPERTY_INVERSE_TEMPLATE = ''' #LABEL# '''
OBJECT_DEFINED_STRING = ' \n'
DATATYPE_PROPERTY_TEMPLATE = ''' #LABEL# '''
DATATYPE_DEFINED_STRING = ' \n'

# Single-pass replacement table used by normalize_label.
_LABEL_TRANSLATION = str.maketrans({
    ' ': '_',
    'à': 'a',
    'è': 'e',
    'é': 'e',
    'ì': 'i',
    'ò': 'o',
    'ù': 'u',
})


# Utility
def normalize_label(label):
    """Return *label* lower-cased, spaces as '_' and à/è/é/ì/ò/ù unaccented.

    An empty or missing (None) label yields ''.
    """
    if not label:
        return ''
    return label.lower().translate(_LABEL_TRANSLATION)


# %%
# CREATE RDF OUTPUT
def create_rdf(data, raw_rdf=None):
    """Fill the main RDF template with the entities/relations in *data*.

    Args:
        data: structure as returned by ``dict_lists_to_json``. It is not
            modified.
        raw_rdf: text of the main template. Defaults to the module-level
            ``RAW_RDF`` (loaded from RDF_MAIN_TEMPLATE by the pipeline cell).

    Returns:
        The template text where the occurrences of ENTITY_TEMPLATE,
        DATATYPE_PROPERTY_TEMPLATE and OBJECT_PROPERTY_INVERSE_TEMPLATE are
        replaced by the generated class, datatype-property and
        object-property snippets respectively.
    """
    if raw_rdf is None:
        raw_rdf = RAW_RDF

    # Work on a copy: subclass links are added below as extra relations and
    # must not leak into the caller's data (repeat calls would duplicate them).
    relations = list(data['Relazioni'])

    entities_rdf_list = []
    datatype_properties_rdf_list = []
    for label, ent in data['Entità'].items():
        entity_name = normalize_label(label)
        entity_rdf = ENTITY_TEMPLATE.replace('#LABEL#', label).replace('#NAME#', entity_name)

        cidoc_class = ent.get('Classe CIDOC proposta')
        if cidoc_class:
            entity_rdf = entity_rdf.replace('#URI#', cidoc_class)
        else:
            entity_rdf = entity_rdf.replace(CLASS_DEFINED_STRING, '')

        # Subclasses: also emitted as an is_subclass_of/is_superclass_of
        # object-property pair.
        if 'Sottoclasse di' in ent:
            parent = ent['Sottoclasse di']
            relations.append({
                "Entità 1": label,
                "Entità 2": parent,
                "Etichetta": "is_subclass_of",
                "Inversa": "is_superclass_of",
            })
            entity_rdf = entity_rdf.replace('#PARENT#', normalize_label(parent))
        else:
            entity_rdf = entity_rdf.replace(SUBCLASS_STRING, '')
        entities_rdf_list.append(entity_rdf)

        for datatype in ent.get('Attributi') or []:
            datatype_label = datatype['Nome']
            datatype_name = normalize_label(datatype_label)
            datatype_rdf = (
                DATATYPE_PROPERTY_TEMPLATE
                .replace('#LABEL#', datatype_label)
                .replace('#NAME#', datatype_name)
                .replace('#DOMAIN#', entity_name)
            )
            datatype_cidoc_class = datatype.get('Classe CIDOC proposta')
            if datatype_cidoc_class:
                datatype_rdf = datatype_rdf.replace('#URI#', datatype_cidoc_class)
            else:
                datatype_rdf = datatype_rdf.replace(DATATYPE_DEFINED_STRING, '')
            datatype_properties_rdf_list.append(datatype_rdf)

    relations_rdf_list = []
    for rel in relations:
        label = rel['Etichetta']
        inverse_label = rel['Inversa']
        domain = normalize_label(rel['Entità 1'])
        range1 = normalize_label(rel['Entità 2'])
        name = domain + '_' + normalize_label(label) + '_' + range1
        inverse_name = range1 + '_' + normalize_label(inverse_label) + '_' + domain

        relation_rdf = (
            OBJECT_PROPERTY_TEMPLATE
            .replace('#NAME#', name)
            .replace('#LABEL#', label)
            .replace('#DOMAIN#', domain)
            .replace('#RANGE#', range1)
        )
        # dict_lists_to_json stores the link under RELATION_CIDOC_KEY; the raw
        # column label is still accepted for dicts built by other means.
        relation_cidoc_class = rel.get(RELATION_CIDOC_KEY) or rel.get(CIDOC_COLUMN_LABEL)
        if relation_cidoc_class:
            relation_rdf = relation_rdf.replace('#URI#', relation_cidoc_class)
        else:
            relation_rdf = relation_rdf.replace(OBJECT_DEFINED_STRING, '')

        # NOTE(review): the inverse snippet strips CLASS_DEFINED_STRING, not
        # OBJECT_DEFINED_STRING - confirm this is intended.
        relation_inverse_rdf = (
            OBJECT_PROPERTY_INVERSE_TEMPLATE
            .replace('#NAME#', inverse_name)
            .replace('#LABEL#', inverse_label)
            .replace('#DOMAIN#', range1)
            .replace('#RANGE#', domain)
            .replace('#INV#', name)
            .replace(CLASS_DEFINED_STRING, '')
        )
        relations_rdf_list.append(relation_rdf + '\n\n\n' + relation_inverse_rdf)

    return (
        raw_rdf
        .replace(ENTITY_TEMPLATE, '\n\n\n'.join(entities_rdf_list))
        .replace(DATATYPE_PROPERTY_TEMPLATE, '\n\n\n'.join(datatype_properties_rdf_list))
        .replace(OBJECT_PROPERTY_INVERSE_TEMPLATE, '\n\n\n'.join(relations_rdf_list))
    )


# %%
# RUN THE PIPELINE
# Guarded so that importing this module only defines constants and functions.
# Executed as a script (or as interactive cells) it performs the same steps,
# in the same order, and defines the same module-level names as before.
if __name__ == "__main__":
    # Import xlsx through openpyxl (.xlsx is a zip container, so there is no
    # text encoding to specify here)
    import openpyxl as op

    input_data = op.load_workbook(DATA_FOLDER + XLSX_FILENAME)
    entities_sheet = input_data[ENTITIES_SHEETNAME]
    relations_sheet = input_data[RELATIONS_SHEETNAME]

    # Export sheet data to csv
    _write_sheet_csv(entities_sheet, DATA_FOLDER + ent_filename)
    _write_sheet_csv(relations_sheet, DATA_FOLDER + rel_filename)

    # Read csv files back in (or use them directly as starting points)
    entities = _read_csv_dicts(DATA_FOLDER + ent_filename, ENTITY_FIELDNAMES)
    relations = _read_csv_dicts(DATA_FOLDER + rel_filename, RELATION_FIELDNAMES)

    json_data = dict_lists_to_json(entities, relations)

    # Export data (UTF-8: ensure_ascii=False writes accented characters as-is)
    with open(OUTPUT_FOLDER + ONTO_FILENAME + '.json', 'w', encoding='utf-8') as out_json:
        json.dump(json_data, out_json, indent=2, ensure_ascii=False)

    # Consistency check: every entity used in a relation must be defined.
    entity_set = set(json_data['Entità'])
    entity_relations_set = {
        ent
        for rel in json_data['Relazioni']
        for ent in (rel['Entità 1'], rel['Entità 2'])
    }
    if not entity_relations_set.issubset(entity_set):
        # Print the entities that relations mention but the entity sheet lacks.
        print(entity_relations_set.difference(entity_set))
    # NOTE(review): the original left a reminder here to comment on '#any'.

    # Load the main RDF template used by create_rdf
    # NOTE(review): assumes the template file is UTF-8 - confirm.
    with open(DATA_FOLDER + RDF_MAIN_TEMPLATE, 'r', encoding='utf-8') as in_file:
        RAW_RDF = in_file.read()

# %%
# Build the RDF text and write it out. Runs only when the file is executed as
# a script or as interactive cells, since it depends on `json_data` produced
# by the earlier pipeline steps.
if __name__ == "__main__":
    rdf_data = create_rdf(json_data)

    # Export (explicit UTF-8: entity labels may contain accented characters,
    # and the platform default encoding is not guaranteed to handle them)
    with open(OUTPUT_FOLDER + ONTO_FILENAME + '.rdf', 'w', encoding='utf-8') as out_file:
        out_file.write(rdf_data)

# %%
# https://service.tib.eu/webvowl/

# %%