ontology_parser.py

# %%
import csv
import json

# BASIC CONFIGURATION
DATA_FOLDER = './data/'
OUTPUT_FOLDER = './output/'
ONTO_FILENAME = 'manoscritti_dariah'  # No extension!

ent_filename = ONTO_FILENAME + '_entities.csv'
rel_filename = ONTO_FILENAME + '_relations.csv'
# %%
# PART II: collect csv data into a 'pre-ontology' structure
# Read csv files back in (or use them directly as starting points)
HEADER_ROW = True

# Not difficult to add more keys (column names)
ENTITIES_COLUMN_LABEL = 'ENTITÀ'
ATTRIBUTES_COLUMN_LABEL = 'ATTRIBUTO (LITERAL)'
SAMEAS_COLUMN_LABEL = 'SAME AS'
#
RELATION_FIRST_COLUMN_LABEL = 'ENTITÀ 1'
RELATION_SECOND_COLUMN_LABEL = 'ENTITÀ 2'
RELATION_NAME_COLUMN_LABEL = 'NOME RELAZIONE'
INVERSE_RELATION_COLUMN_LABEL = 'NOME RELAZIONE INVERSA'
#
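# For reference, the column layout assumed by the code below (taken from the
# labels above; whether a header row is present is controlled by HEADER_ROW):
#
#   <ONTO_FILENAME>_entities.csv  -> ENTITÀ | ATTRIBUTO (LITERAL) | SAME AS
#   <ONTO_FILENAME>_relations.csv -> ENTITÀ 1 | ENTITÀ 2 | NOME RELAZIONE | NOME RELAZIONE INVERSA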
with open(OUTPUT_FOLDER + ent_filename, 'r', encoding='utf-8') as in_file:
    if HEADER_ROW:
        reader = csv.DictReader(in_file)
    else:
        reader = csv.DictReader(in_file, fieldnames=[ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL])
    entities = [row for row in reader]

# Post-process: check & correct homonyms in attribute names
attribute_counts = {}
for ent in entities:
    if ent.get(ATTRIBUTES_COLUMN_LABEL):
        attribute = ent[ATTRIBUTES_COLUMN_LABEL]
        if not attribute_counts.get(attribute):
            attribute_counts[attribute] = 1
        else:
            attribute_counts[attribute] = attribute_counts[attribute] + 1
#
attribute_counts = {attr: count for attr, count in attribute_counts.items() if count > 1}
#
for ent in entities:
    if ent.get(ATTRIBUTES_COLUMN_LABEL):
        attribute = ent[ATTRIBUTES_COLUMN_LABEL]
        if attribute_counts.get(attribute):
            ent[ATTRIBUTES_COLUMN_LABEL] = attribute + '_' + ent[ENTITIES_COLUMN_LABEL]
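# Example of the renaming above (hypothetical labels, not taken from the data):
# if 'titolo' is listed as an attribute of both 'MANOSCRITTO' and 'OPERA', the two
# rows become 'titolo_MANOSCRITTO' and 'titolo_OPERA', so the datatype properties
# generated later no longer collide.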
# Only used for cross-check
with open(OUTPUT_FOLDER + ent_filename.replace('.csv', '_aux.csv'), 'w', encoding='utf-8', newline='\n') as out_file_aux:
    writer = csv.DictWriter(out_file_aux, fieldnames=[ENTITIES_COLUMN_LABEL, ATTRIBUTES_COLUMN_LABEL, SAMEAS_COLUMN_LABEL])
    writer.writeheader()
    writer.writerows(entities)

with open(OUTPUT_FOLDER + rel_filename, 'r', encoding='utf-8') as in_file:
    if HEADER_ROW:
        reader = csv.DictReader(in_file)
    else:
        reader = csv.DictReader(in_file, fieldnames=[RELATION_FIRST_COLUMN_LABEL, RELATION_SECOND_COLUMN_LABEL, RELATION_NAME_COLUMN_LABEL, INVERSE_RELATION_COLUMN_LABEL])
    relations = [row for row in reader]
# %%
# From here on, work with the 'entities' and 'relations' lists of dicts.
# Arrange them in a nested structure, for convenience.
def dict_lists_to_json(entities_local, relations_local):
    entity = {}
    current_entity = None
    for row in entities_local:
        entity_name = row.get(ENTITIES_COLUMN_LABEL)
        attribute_name = row.get(ATTRIBUTES_COLUMN_LABEL)
        same_as_row = row.get(SAMEAS_COLUMN_LABEL)
        same_as_list = same_as_row.split(',') if same_as_row else []
        if entity_name:
            current_entity = entity_name
            if not entity.get(current_entity):
                entity[current_entity] = {}
        if current_entity and attribute_name:
            if not entity[current_entity].get('Attributi'):
                entity[current_entity]['Attributi'] = []
            entity[current_entity]['Attributi'].append(attribute_name)
        if current_entity and same_as_list:
            entity[current_entity]['Sinonimi'] = [s.strip() for s in same_as_list]
    # Add subclass information
    for row in relations_local:
        entity1 = row.get(RELATION_FIRST_COLUMN_LABEL)
        entity2 = row.get(RELATION_SECOND_COLUMN_LABEL)
        label = row.get(RELATION_NAME_COLUMN_LABEL)
        if label == "is_subclass_of":
            if entity1 in entity:
                entity[entity1]["Sottoclasse di"] = entity2
    # Construct relations
    entity_relations = []
    for row in relations_local:
        if row[RELATION_NAME_COLUMN_LABEL] != "is_subclass_of":
            relation = {
                "Entità 1": row[RELATION_FIRST_COLUMN_LABEL],
                "Entità 2": row[RELATION_SECOND_COLUMN_LABEL],
                "Etichetta": row[RELATION_NAME_COLUMN_LABEL],
                "Inversa": row[INVERSE_RELATION_COLUMN_LABEL]
            }
            entity_relations.append(relation)
    # Create final JSON structure
    data = {
        "Entità": entity,
        "Relazioni": entity_relations
    }
    return data
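# For orientation, the returned structure has this shape (the placeholder names in
# angle brackets are illustrative, not real data):
#
# {
#     "Entità": {
#         "<entity label>": {
#             "Attributi": ["<attribute label>", ...],       # optional
#             "Sinonimi": ["<same-as value>", ...],          # optional
#             "Sottoclasse di": "<parent entity label>"      # optional
#         }
#     },
#     "Relazioni": [
#         {"Entità 1": "...", "Entità 2": "...", "Etichetta": "...", "Inversa": "..."}
#     ]
# }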
# %%
json_data = dict_lists_to_json(entities, relations)

# Export data
with open(OUTPUT_FOLDER + ONTO_FILENAME + '.json', 'w', encoding='utf-8') as out_json:
    json.dump(json_data, out_json, indent=2, ensure_ascii=False)

# %%
# Consistency check on the assembled data
entity_set = set(json_data['Entità'].keys())
entity_relations_set = {ent for rel in json_data['Relazioni'] for ent in [rel['Entità 1'], rel['Entità 2']]}
# The check: every entity mentioned in a relation must also be defined as an entity;
# print any that are not
if not entity_relations_set.issubset(entity_set):
    print(entity_relations_set.difference(entity_set))
# Note on #any
# %%
# RDF Templates
RDF_MAIN_TEMPLATE = 'template.rdf'
with open(DATA_FOLDER + RDF_MAIN_TEMPLATE, 'r', encoding='utf-8') as in_file:
    RAW_RDF = in_file.read()
# RDF snippets; info will replace the placeholder tags (in uppercase, between '#')
ENTITY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:Class rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>
</owl:Class>
'''
# Must match the subClassOf line of ENTITY_TEMPLATE exactly (including indentation),
# so that it can be stripped out when an entity has no parent
SUBCLASS_STRING = "    <rdfs:subClassOf>#PARENT#</rdfs:subClassOf>\n"

OBJECT_PROPERTY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:ObjectProperty>
'''

OBJECT_PROPERTY_INVERSE_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:ObjectProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <owl:inverseOf rdf:resource="&h2iosc;#INV#"/>
    <rdfs:range rdf:resource="&h2iosc;#RANGE#"/>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:ObjectProperty>
'''

DATATYPE_PROPERTY_TEMPLATE = '''
<!-- http://www.h2iosc.it/onto##NAME# -->
<owl:DatatypeProperty rdf:about="&h2iosc;#NAME#">
    <rdfs:label>#LABEL#</rdfs:label>
    <rdfs:domain rdf:resource="&h2iosc;#DOMAIN#"/>
</owl:DatatypeProperty>
'''
# Utility
def normalize_label(label):
    # Lowercase, replace spaces with underscores, transliterate accented vowels
    return (label.lower()
            .replace(' ', '_')
            .replace('à', 'a').replace('è', 'e').replace('é', 'e')
            .replace('ì', 'i').replace('ò', 'o').replace('ù', 'u'))
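
# %%
# Quick sanity check for normalize_label; the sample strings are made up and the
# expected values simply follow the replacement rules above.
assert normalize_label('Unità di misura') == 'unita_di_misura'
assert normalize_label('Città') == 'citta'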
# %%
# CREATE RDF OUTPUT
def create_rdf(data):
    entities_rdf_list = []
    datatype_properties_rdf_list = []
    for label, ent in data['Entità'].items():
        entity_name = normalize_label(label)
        entity_rdf = ENTITY_TEMPLATE.replace('#LABEL#', label).replace('#NAME#', entity_name)
        # Subclasses
        if 'Sottoclasse di' in ent.keys():
            parent = ent['Sottoclasse di']
            data['Relazioni'].append({"Entità 1": label,
                                      "Entità 2": parent,
                                      "Etichetta": "is_subclass_of",
                                      "Inversa": "is_superclass_of"})
            entity_rdf = entity_rdf.replace('#PARENT#', normalize_label(parent))
        else:
            # No parent: drop the subClassOf line from the snippet
            entity_rdf = entity_rdf.replace(SUBCLASS_STRING, '')
        entities_rdf_list.append(entity_rdf)
        if not ent.get('Attributi'):
            continue
        for datatype_label in ent['Attributi']:
            datatype_name = normalize_label(datatype_label)
            datatype_properties_rdf_list.append(
                DATATYPE_PROPERTY_TEMPLATE.replace('#LABEL#', datatype_label)
                                          .replace('#NAME#', datatype_name)
                                          .replace('#DOMAIN#', entity_name)
            )
    relations_rdf_list = []
    for rel in data['Relazioni']:
        label = rel['Etichetta']
        inverse_label = rel['Inversa']
        domain = normalize_label(rel['Entità 1'])
        range1 = normalize_label(rel['Entità 2'])
        name = domain + '_' + normalize_label(label) + '_' + range1
        inverse_name = range1 + '_' + normalize_label(inverse_label) + '_' + domain
        #
        relation_rdf = (OBJECT_PROPERTY_TEMPLATE.replace('#NAME#', name)
                        .replace('#LABEL#', label)
                        .replace('#DOMAIN#', domain)
                        .replace('#RANGE#', range1))
        #
        relation_inverse_rdf = (OBJECT_PROPERTY_INVERSE_TEMPLATE.replace('#NAME#', inverse_name)
                                .replace('#LABEL#', inverse_label)
                                .replace('#DOMAIN#', range1)
                                .replace('#RANGE#', domain)
                                .replace('#INV#', name))
        #
        relation_full_rdf = relation_rdf + '\n\n\n' + relation_inverse_rdf
        relations_rdf_list.append(relation_full_rdf)
    # template.rdf is expected to contain the three snippet strings verbatim;
    # each occurrence acts as a placeholder and is replaced by the generated blocks
    to_out = (RAW_RDF.replace(ENTITY_TEMPLATE, '\n\n\n'.join(entities_rdf_list))
              .replace(DATATYPE_PROPERTY_TEMPLATE, '\n\n\n'.join(datatype_properties_rdf_list))
              .replace(OBJECT_PROPERTY_INVERSE_TEMPLATE, '\n\n\n'.join(relations_rdf_list)))
    return to_out
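
# For illustration only (hypothetical labels, not taken from the real data): an
# entity labelled 'Manoscritto' with parent 'Documento' would be rendered roughly as
#
#   <!-- http://www.h2iosc.it/onto#manoscritto -->
#   <owl:Class rdf:about="&h2iosc;manoscritto">
#       <rdfs:label>Manoscritto</rdfs:label>
#       <rdfs:subClassOf>documento</rdfs:subClassOf>
#   </owl:Class>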
# %%
rdf_data = create_rdf(json_data)

# Export
with open(OUTPUT_FOLDER + ONTO_FILENAME + '.rdf', 'w', encoding='utf-8') as out_file:
    out_file.write(rdf_data)

# %%
# An easy way to visualize the ontology: upload it to
#
# https://service.tib.eu/webvowl/
# %%