# %%
import csv
import json
# BASIC CONFIGURATION
DATA_FOLDER = './data/'
ONTO_FILENAME = 'man_draft' # No extension!
ent_filename = ONTO_FILENAME + '_entities.csv'
rel_filename = ONTO_FILENAME + '_relations.csv'
# %%
# PART I: parse xlsx to (multiple) csv
# CONFIGURATION
XLSX_FILENAME = ONTO_FILENAME + '.xlsx'
ENTITIES_SHEETNAME = 'Entità'
RELATIONS_SHEETNAME = 'Relazioni'
# %%
# Import xlsx through openpyxl
import openpyxl as op
input_data = op.load_workbook(DATA_FOLDER + XLSX_FILENAME) # Explicitly specify the encoding??
entities_sheet = input_data[ENTITIES_SHEETNAME]
relations_sheet = input_data[RELATIONS_SHEETNAME]
# %%
# Export sheet data to csv
with open(DATA_FOLDER + ent_filename, 'w', encoding='utf-8') as out_file:
writer = csv.writer(out_file)
writer.writerows(entities_sheet.values)
with open(DATA_FOLDER + rel_filename, 'w', encoding='utf-8') as out_file:
writer = csv.writer(out_file)
writer.writerows(relations_sheet.values)
# %%
# PART II: collect csv data into a 'pre-ontology' structure
# Read csv files back in (or use them directly as starting points)
HEADER_ROW = True
# Not difficult to add more keys (column names)
ENTITIES_COLUMN_LABEL = 'ENTITÀ'
ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
SAMEAS_COLUMN_LABEL = 'SAME AS'
#
RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
#
with open(DATA_FOLDER + ent_filename, 'r', encoding='utf-8') as in_file:
if HEADER_ROW:
reader = csv.DictReader(in_file)
else:
reader = csv.DictReader(in_file, fieldnames=[RELATION_FIRST_COLUMN_LABEL, RELATION_SECOND_COLUMN_LABEL, RELATION_NAME_COLUMN_LABEL, INVERSE_RELATION_COLUMN_LABEL])
entities = [row for row in reader]
with open(DATA_FOLDER + rel_filename, 'r', encoding='utf-8') as in_file:
if HEADER_ROW:
reader = csv.DictReader(in_file)
else:
reader = csv.DictReader(in_file, fieldnames=[ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL])
relations = [row for row in reader]
# %%
# From here on, work with the 'entities' and 'relations' lists of dicts. Arrange them in a nested structure, for convenience
def dict_lists_to_json(entities_local, relations_local):
entity = {}
current_entity = None
for row in entities_local:
entity_name = row.get(ENTITIES_COLUMN_LABEL)
attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
same_as_row = row.get(SAMEAS_COLUMN_LABEL)
same_as_list = same_as_row.split(',') if same_as_row else []
if entity_name:
current_entity = entity_name
entity[current_entity] = {}
if current_entity and attribute_name:
if not entity[current_entity].get('Attributi'):
entity[current_entity]['Attributi'] = []
entity[current_entity]['Attributi'].append(attribute_name)
if current_entity and same_as_list:
entity[current_entity]['Sinonimi'] = [s.strip() for s in same_as_list]
# Add subclass information
for row in relations_local:
entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
label = row.get(RELATION_NAME_COLUMN_LABEL)
if label == "is_subclass_of":
if entity1 in entity:
entity[entity1]["Sottoclasse di"] = entity2
# Construct relations
entity_relations = []
for row in relations_local:
if row[RELATION_NAME_COLUMN_LABEL] != "is_subclass_of":
relation = {
"Entità 1": row[RELATION_FIRST_COLUMN_LABEL],
"Entità 2": row[RELATION_SECOND_COLUMN_LABEL],
"Etichetta": row[RELATION_NAME_COLUMN_LABEL],
"Inversa": row[INVERSE_RELATION_COLUMN_LABEL]
}
entity_relations.append(relation)
# Create final JSON structure
data = {
"Entità": entity,
"Relazioni": entity_relations
}
return data
# %%
json_data = dict_lists_to_json(entities, relations)
# Export data
with open(DATA_FOLDER + ONTO_FILENAME + '.json', 'w') as out_json:
json.dump(json_data, out_json, indent=2, ensure_ascii=False)
# %%
# Re-read the data and do a consistency check
entity_set = set(json_data['Entità'].keys())
entity_relations_set = {ent for rel in json_data['Relazioni'] for ent in [rel['Entità 1'], rel['Entità 2']]}
# The check
if not entity_relations_set.issubset(entity_set):
print(entity_relations_set.difference(entity_set))
# Commento su #any
# %%
# RDF Templates
with open('./template.rdf', 'r') as in_file:
RAW_RDF = in_file.read()
ENTITY_TEMPLATE = '''
#LABEL#
#PARENT#
'''
SUBCLASS_STRING = " #PARENT#\n"
OBJECT_PROPERTY_TEMPLATE = '''
#LABEL#
'''
OBJECT_PROPERTY_INVERSE_TEMPLATE = '''
#LABEL#
'''
DATATYPE_PROPERTY_TEMPLATE = '''
#LABEL#
'''
# Utility
def normalize_label(label):
return label.lower().replace(' ', '_').replace('à', 'a').replace('è', 'e').replace('é', 'e').replace('ì', 'i').replace('ò', 'o').replace('ù', 'u')
# %%
# CREATE RDF OUTPUT
def create_rdf(data):
entities_rdf_list = []
datatype_properties_rdf_list = []
for label, ent in data['Entità'].items():
entity_name = normalize_label(label)
entity_rdf = ENTITY_TEMPLATE.replace('#LABEL#', label).replace('#NAME#', entity_name)
# Subclasses
if 'Sottoclasse di' in ent.keys():
parent = ent['Sottoclasse di']
data['Relazioni'].append({"Entità 1": label,
"Entità 2": parent,
"Etichetta": "is_subclass_of", "Inversa": "is_superclass_of"})
entity_rdf = entity_rdf.replace('#PARENT#', normalize_label(parent))
else:
entity_rdf = entity_rdf.replace(SUBCLASS_STRING, '')
entities_rdf_list.append(entity_rdf)
if not ent.get('Attributi'):
continue
for datatype_label in ent['Attributi']:
datatype_name = normalize_label(datatype_label)
datatype_properties_rdf_list.append(
DATATYPE_PROPERTY_TEMPLATE.replace('#LABEL#', datatype_label).replace(
'#NAME#', datatype_name
).replace('#DOMAIN#', entity_name)
)
relations_rdf_list = []
for rel in data['Relazioni']:
label = rel['Etichetta']
inverse_label = rel['Inversa']
domain = normalize_label(rel['Entità 1'])
range1 = normalize_label(rel['Entità 2'])
name = domain + '_' + normalize_label(label) + '_' + range1
inverse_name = range1 + '_' + normalize_label(inverse_label) + '_' + domain
#
relation_rdf = OBJECT_PROPERTY_TEMPLATE.replace('#NAME#', name).replace('#LABEL#', label).replace('#DOMAIN#', domain).replace('#RANGE#', range1)
#
relation_inverse_rdf = OBJECT_PROPERTY_INVERSE_TEMPLATE.replace('#NAME#', inverse_name).replace('#LABEL#', inverse_label).replace('#DOMAIN#', range1).replace('#RANGE#', domain).replace('#INV#', name)
#
relation_full_rdf = relation_rdf + '\n\n\n' + relation_inverse_rdf
relations_rdf_list.append(relation_full_rdf)
to_out = RAW_RDF.replace(ENTITY_TEMPLATE, '\n\n\n'.join(entities_rdf_list)).replace(DATATYPE_PROPERTY_TEMPLATE, '\n\n\n'.join(datatype_properties_rdf_list)
).replace(OBJECT_PROPERTY_INVERSE_TEMPLATE, '\n\n\n'.join(relations_rdf_list))
return to_out
# %%
rdf_data = create_rdf(json_data)
# Export
with open(DATA_FOLDER + ONTO_FILENAME + '.rdf', 'w') as out_file:
out_file.write(rdf_data)
# %%
# https://service.tib.eu/webvowl/
# %%