|
@@ -2,6 +2,8 @@
|
|
import csv
|
|
import csv
|
|
import json
|
|
import json
|
|
import openpyxl as op
|
|
import openpyxl as op
|
|
|
|
+import re
|
|
|
|
+
|
|
|
|
|
|
# BASIC CONFIGURATION
|
|
# BASIC CONFIGURATION
|
|
DATA_FOLDER = './data/'
|
|
DATA_FOLDER = './data/'
|
|
@@ -45,6 +47,7 @@ raw_relations = [{key: row[ind] for ind, key in enumerate(relations_keys)} for r
|
|
#
|
|
#
|
|
# TODO: completare secondo le specifiche sopra
|
|
# TODO: completare secondo le specifiche sopra
|
|
# TODO: effettuare il merge con i "miei" CSV, che hanno informazioni in più!
|
|
# TODO: effettuare il merge con i "miei" CSV, che hanno informazioni in più!
|
|
|
|
+# TODO: ottimizzare un po' la scrittura del codice
|
|
|
|
|
|
|
|
|
|
# Process entities:
|
|
# Process entities:
|
|
@@ -55,7 +58,7 @@ for ent in raw_entities:
|
|
entity_names = ent['Concetto']
|
|
entity_names = ent['Concetto']
|
|
if not isinstance(entity_names, str):
|
|
if not isinstance(entity_names, str):
|
|
continue
|
|
continue
|
|
- aliases = [al.strip().title() for al in entity_names.split('\n') if al.strip()]
|
|
|
|
|
|
+ aliases = [re.sub(r'\s+', ' ', al.strip().title()) for al in entity_names.split('\n') if al.strip()]
|
|
if not aliases:
|
|
if not aliases:
|
|
continue
|
|
continue
|
|
entity_name = aliases[0]
|
|
entity_name = aliases[0]
|
|
@@ -79,14 +82,14 @@ for rel in raw_relations:
|
|
obj = rel['Oggetto']
|
|
obj = rel['Oggetto']
|
|
if not isinstance(subj, str) or not isinstance(obj, str):
|
|
if not isinstance(subj, str) or not isinstance(obj, str):
|
|
continue
|
|
continue
|
|
- subj = subj.strip().title()
|
|
|
|
- obj = obj.strip().title()
|
|
|
|
|
|
+ subj = re.sub(r'\s+', ' ', subj.strip().title())
|
|
|
|
+ obj = re.sub(r'\s+', ' ', obj.strip().title())
|
|
if subj==obj:
|
|
if subj==obj:
|
|
continue
|
|
continue
|
|
|
|
|
|
rel_name = rel['Relazione']
|
|
rel_name = rel['Relazione']
|
|
if isinstance(rel_name, str):
|
|
if isinstance(rel_name, str):
|
|
- rel_name = rel_name.strip().lower()
|
|
|
|
|
|
+ rel_name = re.sub(r'\s+', '_', rel_name.strip().lower()).replace('__', '_')
|
|
better_rel = {'Soggetto': subj, 'Relazione': rel_name, 'Oggetto': obj, 'Pair': tuple(set([subj, obj]))}
|
|
better_rel = {'Soggetto': subj, 'Relazione': rel_name, 'Oggetto': obj, 'Pair': tuple(set([subj, obj]))}
|
|
clean_relations.append(better_rel)
|
|
clean_relations.append(better_rel)
|
|
|
|
|