lemmatized-entities-parser.service.ts 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. import { Injectable } from '@angular/core';
  2. import { parse, ParserRegister } from '.';
  3. import {
  4. LemmatizedEntitiesList, LemmatizedEntity, LemmatizedEntityOccurrence, LemmatizedEntityOccurrenceRef, Page, XMLElement,
  5. } from '../../models/evt-models';
  6. import { isNestedInElem } from '../../utils/dom-utils';
  7. import { Map } from '../../utils/js-utils';
  8. import { GenericElemParser } from './basic-parsers';
  9. import { getLemListsToParseTagNames, lemmatizedEntitiesListsTagNamesMap } from './lemmatized-entity-parsers';
  10. import { createParser } from './parser-models';
  /**
   * Service that parses lemmatized-entity lists out of an XML document and
   * collects, page by page, the elements that reference those entities.
   */
  @Injectable({
    providedIn: 'root',
  })
  export class LemmatizedEntitiesParserService {
    // Tag-name / selector configuration shared with the lemmatized-entity parsers.
    private tagLemNamesMap = lemmatizedEntitiesListsTagNamesMap;

    /**
     * Parses every lemmatized-entity list found in the given document.
     *
     * @param document XML root to search.
     * @returns The parsed lists, plus the `content` and `relations` of all
     * lists concatenated into flat `entities` and `relations` arrays.
     */
    public parseLemLists(document: XMLElement) {
      const lemListsToParse = getLemListsToParseTagNames();
      const lemListParser = ParserRegister.get('evt-lemmatized-entities-list-parser');
      // Only first-level lists are parsed here: a list that isNestedInElem reports as
      // nested in an element with the same tag name is dropped
      // (presumably it is handled while parsing its parent list - TODO confirm).
      const lemlists = Array.from(document.querySelectorAll<XMLElement>(lemListsToParse.toString()))
        .filter((lemlist) => !isNestedInElem(lemlist, lemlist.tagName))
        .map((l) => lemListParser.parse(l) as LemmatizedEntitiesList);

      return {
        lemlists,
        entities: lemlists.map(({ content }) => content).reduce((a, b) => a.concat(b), []),
        relations: lemlists.map(({ relations }) => relations).reduce((a, b) => a.concat(b), []),
      };
    }

    /**
     * Restricts lists and entities to those whose `lemmatizedEntityType` is one of `type`.
     *
     * @param lemlists Lists to filter.
     * @param entities Entities to filter.
     * @param type Accepted entity types.
     * @returns The matching `lemlists` and `entities`.
     */
    public getResultsByType(lemlists: LemmatizedEntitiesList[], entities: LemmatizedEntity[], type: string[]) {
      return {
        lemlists: lemlists.filter(list => type.indexOf(list.lemmatizedEntityType) >= 0),
        entities: entities.filter(entity => type.indexOf(entity.lemmatizedEntityType) >= 0),
      };
    }

    /**
     * Collects the entity occurrences of all pages.
     *
     * @param pages Pages to scan.
     * @returns An object keyed by entity ref; each value is an array holding one
     * per-page occurrence record for every page in which that ref appears.
     */
    public parseLemmatizedEntitiesOccurrences(pages: Page[]) {
      return pages.map(p => this.getLemmatizedEntitiesOccurrencesInPage(p))
        .reduce(
          (x, y) => {
            Object.keys(y).forEach(k => {
              // Own-property check rather than truthiness: a ref such as "constructor" or
              // "toString" would otherwise resolve to an inherited Object.prototype member
              // and the concat call below would throw.
              if (Object.prototype.hasOwnProperty.call(x, k)) {
                x[k] = x[k].concat([y[k]]);
              } else {
                x[k] = [y[k]];
              }
            });

            return x;
          },
          {});
    }

    /**
     * Collects the entity occurrences found in a single page.
     *
     * Only element nodes of `p.originalContent` are inspected. For each of them the
     * element itself (when it matches the occurrences configuration) and its matching
     * descendants are parsed, provided they carry a non-empty `ref` attribute.
     *
     * NOTE(review): the declared return type is an array, but the value actually built
     * is a plain object keyed by entity ref; the signature is kept for caller compatibility.
     *
     * @param p Page to scan.
     * @returns For every referenced entity: the page id and label, and the parsed
     * occurrence elements grouped by containing document.
     */
    public getLemmatizedEntitiesOccurrencesInPage(p: Page): Array<Map<LemmatizedEntityOccurrence>> {
      return p.originalContent
        .filter(e => e.nodeType === 1)
        .map(e => {
          const occurrences = [];
          if (this.tagLemNamesMap.occurrences.indexOf(e.tagName) >= 0 && e.getAttribute('ref')) { // Handle first level page contents
            occurrences.push(this.parseLemmatizedEntityOccurrence(e));
          }

          // Descendants get the same `ref` guard as first-level elements, so an element
          // without a reference is skipped instead of crashing the parser.
          return occurrences.concat(Array.from(e.querySelectorAll<XMLElement>(this.tagLemNamesMap.occurrences))
            .filter(el => !!el.getAttribute('ref'))
            .map(el => this.parseLemmatizedEntityOccurrence(el)));
        })
        .filter(e => e.length > 0)
        .reduce((x, y) => x.concat(y), [])
        .reduce(
          (x, y) => {
            // Reuse the per-document groups already gathered for this ref, if any.
            const refsByDoc: LemmatizedEntityOccurrenceRef[] =
              Object.prototype.hasOwnProperty.call(x, y.ref) ? x[y.ref].refsByDoc || [] : [];
            const docRefs = refsByDoc.find(r => r.docId === y.docId);
            if (docRefs) {
              docRefs.refs.push(y.el);
            } else {
              refsByDoc.push({
                docId: y.docId,
                refs: [y.el],
                docLabel: y.docLabel,
              });
            }

            return {
              ...x, [y.ref]: {
                pageId: p.id,
                pageLabel: p.label,
                refsByDoc,
              },
            } as Array<Map<LemmatizedEntityOccurrence>>;
          },
          {});
    }

    /**
     * Builds the raw occurrence record for one referencing element.
     *
     * @param xml Element carrying a `ref` attribute (callers guarantee it is present).
     * @returns The ref with its first `#` removed, the parsed element, and the id and
     * label of the closest enclosing `text` element (empty strings when there is none).
     */
    private parseLemmatizedEntityOccurrence(xml: XMLElement) {
      const doc = xml.closest('text');
      const elementParser = createParser(GenericElemParser, parse);

      return {
        ref: xml.getAttribute('ref').replace('#', ''),
        el: elementParser.parse(xml),
        docId: doc ? doc.getAttribute('xml:id') : '', // TODO: get proper document id when missing
        docLabel: doc ? doc.getAttribute('n') || doc.getAttribute('xml:id') : '', // TODO: get proper document label when attributes missing
      };
    }
  }