
update parsers

federicaspinelli 2 years ago
parent
commit
5eaf0722ad

+ 3 - 4
ASPO/CSV_to_RDF/datini/CSV_to_RDF_datini_item_event_exchange_receiver.ipynb

@@ -186,6 +186,9 @@
     "    writeTTLHeader(output)\n",
     "    first = True\n",
     "    ii = 0\n",
+    "    E55rplaceHolder = \"<http://www.archiviodistato.prato.it/\" + receiveLetterCoords.code + \"_\" + typeCoords.code + \">\"\n",
+    "    line = triple(E55rplaceHolder, labelCoords.prefix, \"\\\"Destinatario\\\"\" ) + closeLine\n",
+    "    output.write(line)\n",
     "    for row in reader:\n",
     "        # The index ii is used to process a limited number of entries for testing purposes\n",
     "        ii = ii+1\n",
@@ -199,8 +202,6 @@
     "        el3placeHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \">\"\n",
     "        PC14splaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + sendLetterCoords.code + \"_\" + pcarriedByCoords.code + \">\"   \n",
     "        PC14rplaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \"_\" + pcarriedByCoords.code + \">\"   \n",
-    "        E55splaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + sendLetterCoords.code + \"_\" + typeCoords.code + \">\"       \n",
-    "        E55rplaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \"_\" + typeCoords.code + \">\"\n",
     "        if(row['persona_destinatario'] != ''):\n",
     "            destinatario = row['persona_destinatario'].replace('{\"nome\":', '').replace(',', '').replace('\"authID\":', '')\n",
     "            name_destinatario = re.sub('IT-ASPO-AU00003-[0-9].*}', '', destinatario).replace('\"', '').replace('}', '').strip()\n",
@@ -216,8 +217,6 @@
     "            output.write(line)\n",
     "            line = triple(PC14rplaceHolder, roleOfCoords.prefix, E55rplaceHolder) + closeLine\n",
     "            output.write(line)\n",
-    "            line = triple(E55rplaceHolder, labelCoords.prefix, \"\\\"Destinatario\\\"\" ) + closeLine\n",
-    "            output.write(line)\n",
     "            line = triple(PC14rplaceHolder, hasRangeCoords.prefix, actorplaceHolder) + closeLine\n",
     "            output.write(line)\n",
     "            output.write('\\n')\n",

+ 3 - 4
ASPO/CSV_to_RDF/datini/CSV_to_RDF_datini_item_event_exchange_sender.ipynb

@@ -185,6 +185,9 @@
     "    writeTTLHeader(output)\n",
     "    first = True\n",
     "    ii = 0\n",
+    "    E55splaceHolder = \"<http://www.archiviodistato.prato.it/\" + sendLetterCoords.code + \"_\" + typeCoords.code + \">\"       \n",
+    "    line = triple(E55splaceHolder, labelCoords.prefix, \"\\\"Mittente\\\"\" ) + closeLine\n",
+    "    output.write(line)\n",
     "    for row in reader:\n",
     "        # The index ii is used to process a limited number of entries for testing purposes\n",
     "        ii = ii+1\n",
@@ -198,8 +201,6 @@
     "        el3placeHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \">\"\n",
     "        PC14splaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + sendLetterCoords.code + \"_\" + pcarriedByCoords.code + \">\"   \n",
     "        PC14rplaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \"_\" + pcarriedByCoords.code + \">\"   \n",
-    "        E55splaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + sendLetterCoords.code + \"_\" + typeCoords.code + \">\"       \n",
-    "        E55rplaceHolder = \"<http://datini.archiviodistato.prato.it/la-ricerca/scheda/\" + row['id'] + \"/\" + receiveLetterCoords.code + \"_\" + typeCoords.code + \">\"\n",
     "        if(row['persona_mittente'] != ''):\n",
     "            mittente = row['persona_mittente'].replace('{\"nome\":', '').replace(',', '').replace('\"authID\":', '')\n",
     "            name_mittente = re.sub('IT-ASPO-AU00003-[0-9].*}', '', mittente).replace('\"', '').replace('}', '').strip()\n",
@@ -215,8 +216,6 @@
     "            output.write(line)\n",
     "            line = triple(PC14splaceHolder, roleOfCoords.prefix, E55splaceHolder) + closeLine\n",
     "            output.write(line)\n",
-    "            line = triple(E55splaceHolder, labelCoords.prefix, \"\\\"Mittente\\\"\" ) + closeLine\n",
-    "            output.write(line)\n",
     "            line = triple(PC14splaceHolder, hasRangeCoords.prefix, actorplaceHolder) + closeLine\n",
     "            output.write(line)\n",
     "            output.write('\\n')\n",

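Both notebook changes are the same refactor: the E55 role node used to embed row['id'], so an identical rdfs:label triple ("Destinatario" or "Mittente") was rewritten for every letter. The new code mints one shared E55 URI and writes its label once, before the row loop. A minimal sketch of the receiver-side pattern, assuming the triple/closeLine helpers and *Coords objects these notebooks already define:

    # One shared E55 role node: its URI no longer depends on row['id'],
    # so its label triple is emitted once, ahead of the loop.
    E55rplaceHolder = ("<http://www.archiviodistato.prato.it/"
                       + receiveLetterCoords.code + "_" + typeCoords.code + ">")
    output.write(triple(E55rplaceHolder, labelCoords.prefix, '"Destinatario"') + closeLine)

    for row in reader:
        # Each per-letter PC14 node still points at the shared role node.
        PC14rplaceHolder = ("<http://datini.archiviodistato.prato.it/la-ricerca/scheda/"
                            + row['id'] + "/" + receiveLetterCoords.code
                            + "_" + pcarriedByCoords.code + ">")
        output.write(triple(PC14rplaceHolder, roleOfCoords.prefix, E55rplaceHolder) + closeLine)

The sender notebook is identical, with sendLetterCoords and the "Mittente" label.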
+ 68 - 36
ASPO/CSV_to_RDF/datini/CSV_to_RDF_onomastica_datini.py

@@ -88,9 +88,7 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
         # The index ii is used to process a limited number of entries for testing purposes
         ii = ii + 1
         if row['entityType'] == 'person':
-
             id_aspo = row['recordId']
-            #placeHolders
             aspoPlaceHolder = aspoCoords.prefix + id_aspo
             line = triple(aspoPlaceHolder, 
                           nsCoords.prefix + 'type', 
@@ -183,24 +181,48 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
                 output.write(line)
 
             if row['occupation'] != '' and row['occupation'] != ' ' :
-                #Remove all white-space characters:
-                txt = row['occupation']
-                x = re.sub("\n", " ", txt)
-                y = re.sub("\s\s", "", x)
-                occ = re.sub(r'[^A-Za-z]','', y)
-                occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
-                line = triple(aspoPlaceHolder,
-                              schemaCoords.prefix + 'hasOccupation',
-                              occupationPlaceHolder) + closeLine
-                output.write(line)
-                line = triple(occupationPlaceHolder,
-                              nsCoords.prefix + 'type',
-                              schemaCoords.prefix + 'Occupation') + closeLine
-                output.write(line)
-                line = triple(occupationPlaceHolder,
-                              rdfsCoords.prefix + 'label',
-                              '\"' + y + '\"') + closeLine
-                output.write(line)
+                occupazioni = []
+                pipe = "|"
+                if pipe in row['occupation']:
+                    occupazioni = row['occupation'].split('|') 
+                    for occupazione in occupazioni:
+                        #Remove all white-space characters:
+                        txt = occupazione
+                        x = re.sub("\n", " ", txt)
+                        y = re.sub("\s\s", "", x)
+                        occ = re.sub(r'[^A-Za-z]','', y)
+                        occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
+                        line = triple(aspoPlaceHolder,
+                                    schemaCoords.prefix + 'hasOccupation',
+                                    occupationPlaceHolder) + closeLine
+                        output.write(line)
+                        line = triple(occupationPlaceHolder,
+                                    nsCoords.prefix + 'type',
+                                    schemaCoords.prefix + 'Occupation') + closeLine
+                        output.write(line)
+                        line = triple(occupationPlaceHolder,
+                                    rdfsCoords.prefix + 'label',
+                                    '\"' + y + '\"') + closeLine
+                        output.write(line)
+                else:
+                    #Remove all white-space characters:
+                    txt = row['occupation']
+                    x = re.sub("\n", " ", txt)
+                    y = re.sub("\s\s", "", x)
+                    occ = re.sub(r'[^A-Za-z]','', y)
+                    occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
+                    line = triple(aspoPlaceHolder,
+                                schemaCoords.prefix + 'hasOccupation',
+                                occupationPlaceHolder) + closeLine
+                    output.write(line)
+                    line = triple(occupationPlaceHolder,
+                                nsCoords.prefix + 'type',
+                                schemaCoords.prefix + 'Occupation') + closeLine
+                    output.write(line)
+                    line = triple(occupationPlaceHolder,
+                                rdfsCoords.prefix + 'label',
+                                '\"' + y + '\"') + closeLine
+                    output.write(line)
 
             if row['avo 1'] != '':
                 avo1 = '<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/' + id_aspo + "/avo1>"
@@ -233,25 +255,35 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
                 output.write(line)
 
             if row['Qualifica'] != '':
+                qualifiche = []
+                pipe = "|"
+                if pipe in row['Qualifica']:
+                    qualifiche = row['Qualifica'].split('|') 
+                    for qualifica in qualifiche:
+                        #Remove all white-space characters:
+                        txt = qualifica
+                        x = re.sub("\n", " ", txt)
+                        y = re.sub("\s\s", " ", x)
+                        line = triple(aspoPlaceHolder, schemaCoords.prefix + 'honorificPrefix', '\"' + str(y) + '\"') + closeLine
+                        output.write(line)
+                else:
                 #Remove all white-space characters:
-                txt = row['Qualifica']
+                    txt = row['Qualifica']
+                    x = re.sub("\n", " ", txt)
+                    y = re.sub("\s\s", " ", x)
+                    line = triple(aspoPlaceHolder, schemaCoords.prefix + 'honorificPrefix', '\"' + y + '\"') + closeLine
+                    output.write(line)
+
+            if row['place_occupation_Qualifica'] != '':
+                #Remove all white-space characters:
+                txt = row['place_occupation_Qualifica']
                 x = re.sub("\n", " ", txt)
-                y = re.sub("\s\s", " ", x)
+                y = re.sub("\s\s", "", x)
                 line = triple(aspoPlaceHolder,
-                              schemaCoords.prefix + 'honorificPrefix',
-                              '\"' + y + '\"') + closeLine
+                              schemaCoords.prefix + 'workLocation',
+                              '\"' + row['place_occupation_Qualifica'].replace('\\','\\\\').replace('"','\\"') + '\"') + closeLine
                 output.write(line)
 
-            #if row['place_occupation_Qualifica'] != '':
-                #Remove all white-space characters:
-            #    txt = row['place_occupation_Qualifica']
-            #    x = re.sub("\n", " ", txt)
-            #    y = re.sub("\s\s", "", x)
-            #    line = triple(aspoPlaceHolder,
-            #                  schemaCoords.prefix + 'workLocation',
-            #                  '\"' + row['place_occupation_Qualifica'].replace('\\','\\\\').replace('"','\\"') + '\"') + closeLine
-            #    output.write(line)
-
             if row['biogHist p'] != '':
                 #Remove all white-space characters:
                 txt = row['biogHist p']
@@ -278,8 +310,8 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
                     owlCoords.prefix + 'sameAs',
                     aspoCoords.prefix + row['Variante']) + closeLine
                     output.write(line)
-        output.write('\n')
-        #
+            
+            output.write('\n')
         #
         # Limit number of entries processed (if desired)
         if (ii > max_entries):

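The new occupation handling branches on whether the cell contains a pipe, duplicating the whitespace cleanup and the three triple writes in both branches. Since 'a'.split('|') returns ['a'], the no-pipe case is already covered by the loop, so the two branches can collapse into one. A sketch of that equivalent form, assuming the triple/closeLine helpers and *Coords objects defined earlier in this script (write_occupation_triples is a hypothetical helper name, and the final occ.replace(" ", "_") is dropped because it is a no-op after all non-letters are removed):

    import re

    def write_occupation_triples(output, aspoPlaceHolder, raw_value):
        # "a|b".split('|') -> ['a', 'b'] and "a".split('|') -> ['a'],
        # so a single loop covers both the pipe and the no-pipe case.
        for occupazione in raw_value.split('|'):
            x = re.sub(r"\n", " ", occupazione)   # newlines -> spaces
            y = re.sub(r"\s\s", "", x)            # drop doubled whitespace, as in the commit
            occ = re.sub(r"[^A-Za-z]", "", y)     # ASCII-letters-only URI slug
            occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ + '>'
            output.write(triple(aspoPlaceHolder,
                                schemaCoords.prefix + 'hasOccupation',
                                occupationPlaceHolder) + closeLine)
            output.write(triple(occupationPlaceHolder,
                                nsCoords.prefix + 'type',
                                schemaCoords.prefix + 'Occupation') + closeLine)
            output.write(triple(occupationPlaceHolder,
                                rdfsCoords.prefix + 'label',
                                '"' + y + '"') + closeLine)

The same collapse applies to the new Qualifica branches here, and to the identical blocks in the ospedale script below.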
+ 59 - 35
ASPO/CSV_to_RDF/ospedale/CSV_to_RDF_onomastica_ospedale.py

@@ -183,24 +183,48 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
                 output.write(line)
 
             if row['occupation'] != '' and row['occupation'] != ' ' :
-                #Remove all white-space characters:
-                txt = row['occupation']
-                x = re.sub("\n", " ", txt)
-                y = re.sub("\s\s", "", x)
-                occ = re.sub(r'[^A-Za-z]','', y)
-                occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
-                line = triple(aspoPlaceHolder,
-                              schemaCoords.prefix + 'hasOccupation',
-                              occupationPlaceHolder) + closeLine
-                output.write(line)
-                line = triple(occupationPlaceHolder,
-                              nsCoords.prefix + 'type',
-                              schemaCoords.prefix + 'Occupation') + closeLine
-                output.write(line)
-                line = triple(occupationPlaceHolder,
-                              rdfsCoords.prefix + 'label',
-                              '\"' + y + '\"') + closeLine
-                output.write(line)
+                occupazioni = []
+                pipe = "|"
+                if pipe in row['occupation']:
+                    occupazioni = row['occupation'].split('|') 
+                    for occupazione in occupazioni:
+                        #Remove all white-space characters:
+                        txt = occupazione
+                        x = re.sub("\n", " ", txt)
+                        y = re.sub("\s\s", "", x)
+                        occ = re.sub(r'[^A-Za-z]','', y)
+                        occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
+                        line = triple(aspoPlaceHolder,
+                                    schemaCoords.prefix + 'hasOccupation',
+                                    occupationPlaceHolder) + closeLine
+                        output.write(line)
+                        line = triple(occupationPlaceHolder,
+                                    nsCoords.prefix + 'type',
+                                    schemaCoords.prefix + 'Occupation') + closeLine
+                        output.write(line)
+                        line = triple(occupationPlaceHolder,
+                                    rdfsCoords.prefix + 'label',
+                                    '\"' + y + '\"') + closeLine
+                        output.write(line)
+                else:
+                    #Remove all white-space characters:
+                    txt = row['occupation']
+                    x = re.sub("\n", " ", txt)
+                    y = re.sub("\s\s", "", x)
+                    occ = re.sub(r'[^A-Za-z]','', y)
+                    occupationPlaceHolder = '<http://www.archiviodistato.prato.it/' + occ.replace(" ","_") + '>'
+                    line = triple(aspoPlaceHolder,
+                                schemaCoords.prefix + 'hasOccupation',
+                                occupationPlaceHolder) + closeLine
+                    output.write(line)
+                    line = triple(occupationPlaceHolder,
+                                nsCoords.prefix + 'type',
+                                schemaCoords.prefix + 'Occupation') + closeLine
+                    output.write(line)
+                    line = triple(occupationPlaceHolder,
+                                rdfsCoords.prefix + 'label',
+                                '\"' + y + '\"') + closeLine
+                    output.write(line)
 
             if row['avo 1'] != '':
                 avo1 = '<http://www.archiviodistato.prato.it/accedi-e-consulta/aspoMV001/scheda/' + id_aspo + "/avo1>"
@@ -233,24 +257,24 @@ with open(import_dir + filePrefix + fileType + '.csv', newline="") as csv_file,
                 output.write(line)
 
             if row['Qualifica'] != '':
+                qualifiche = []
+                pipe = "|"
+                if pipe in row['Qualifica']:
+                    qualifiche = row['Qualifica'].split('|') 
+                    for qualifica in qualifiche:
+                        #Remove all white-space characters:
+                        txt = qualifica
+                        x = re.sub("\n", " ", txt)
+                        y = re.sub("\s\s", " ", x)
+                        line = triple(aspoPlaceHolder, schemaCoords.prefix + 'honorificPrefix', '\"' + str(y) + '\"') + closeLine
+                        output.write(line)
+                else:
                 #Remove all white-space characters:
-                txt = row['Qualifica']
-                x = re.sub("\n", " ", txt)
-                y = re.sub("\s\s", " ", x)
-                line = triple(aspoPlaceHolder,
-                              schemaCoords.prefix + 'honorificPrefix',
-                              '\"' + y + '\"') + closeLine
-                output.write(line)
-
-            #if row['place_occupation_Qualifica'] != '':
-                #Remove all white-space characters:
-            #    txt = row['place_occupation_Qualifica']
-            #    x = re.sub("\n", " ", txt)
-            #    y = re.sub("\s\s", "", x)
-            #    line = triple(aspoPlaceHolder,
-            #                  schemaCoords.prefix + 'workLocation',
-            #                  '\"' + row['place_occupation_Qualifica'].replace('\\','\\\\').replace('"','\\"') + '\"') + closeLine
-            #    output.write(line)
+                    txt = row['Qualifica']
+                    x = re.sub("\n", " ", txt)
+                    y = re.sub("\s\s", " ", x)
+                    line = triple(aspoPlaceHolder, schemaCoords.prefix + 'honorificPrefix', '\"' + y + '\"') + closeLine
+                    output.write(line)
 
             if row['biogHist p'] != '':
                 #Remove all white-space characters:

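One difference between the two scripts: the datini version re-enables the workLocation block and escapes backslashes and double quotes before embedding the CSV value in a quoted literal, while the occupation labels and honorificPrefix values are written unescaped in both. A small sketch of a shared escaper that would make every literal safe, assuming plain double-quoted Turtle/N-Triples literals (escape_literal is a hypothetical helper):

    def escape_literal(value: str) -> str:
        # Backslashes first, then quotes (the order used for workLocation in
        # CSV_to_RDF_onomastica_datini.py); newlines are flattened because a
        # plain double-quoted Turtle literal cannot span lines.
        return value.replace('\\', '\\\\').replace('"', '\\"').replace('\n', ' ')

    line = triple(aspoPlaceHolder,
                  schemaCoords.prefix + 'honorificPrefix',
                  '"' + escape_literal(y) + '"') + closeLine
    output.write(line)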
+ 235 - 82
Data/DallASPO/RDF/onomastica_datini.ttl

File diff suppressed because it is too large


Some files were not shown because too many files have changed in this diff