structure-xml-parser.service.ts 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. import { Injectable } from '@angular/core';
  2. import { EditionStructure, GenericElement, OriginalEncodingNodeType, Page, XMLElement } from '../../models/evt-models';
  3. import { createNsResolver, getElementsBetweenTreeNode, isNestedInElem } from '../../utils/dom-utils';
  4. import { GenericParserService } from './generic-parser.service';
  5. import { getID, ParseResult } from './parser-models';
  /**
   * Root-provided Angular service that splits an XML edition source into pages
   * and parses the nodes of every page through `GenericParserService`.
   *
   * Pages are delimited by `pb` elements that carry no `ed` attribute. Nodes that
   * belong to `front` are parsed only when they are tagged with
   * `type="document_front"` (see `isMarkedAsOrigContent`).
   */
  @Injectable({
    providedIn: 'root',
  })
  export class StructureXmlParserService {
    constructor(
      private genericParserService: GenericParserService,
    ) {
    }

    /** Value of the `type` attribute that marks front content to be kept. */
    private frontOrigContentAttr = 'document_front';
    readonly frontTagName = 'front';
    readonly pageTagName = 'pb';
    readonly bodyTagName = 'body';

    /**
     * Builds the list of pages found in `el`.
     *
     * - When page breaks exist in both `front` and `body`, all page breaks are
     *   handled as one sequence, using `text` as the enclosing tag.
     * - Otherwise front pages and body pages are computed separately and
     *   concatenated (front first). A section without page breaks becomes a
     *   single page; for `front` this happens only if it is marked as original
     *   content.
     *
     * @param el root element (or document) of the edition source
     * @returns the edition structure; `pages` is empty when `el` is falsy
     */
    parsePages(el: XMLElement): EditionStructure {
      if (!el) { return { pages: [] }; }

      const front: XMLElement = el.querySelector(this.frontTagName);
      const body: XMLElement = el.querySelector(this.bodyTagName);
      // Page breaks that declare an `ed` attribute are skipped.
      const pbs = Array.from(el.querySelectorAll(this.pageTagName)).filter((p) => !p.getAttribute('ed'));
      const frontPbs = pbs.filter((p) => isNestedInElem(p, this.frontTagName));
      const bodyPbs = pbs.filter((p) => isNestedInElem(p, this.bodyTagName));
      // `firstElementChild` is null when `el` has no element children: use `el` itself then.
      const doc = (el.firstElementChild || el).ownerDocument;

      if (frontPbs.length > 0 && bodyPbs.length > 0) {
        return {
          pages: pbs.map((pb: XMLElement, idx, arr: XMLElement[]) => this.parseDocumentPage(doc, pb, arr[idx + 1], 'text')),
        };
      }

      const frontPages = frontPbs.length === 0 && front && this.isMarkedAsOrigContent(front)
        ? [this.parseSinglePage(doc, front, 'page_front', this.frontTagName)]
        : frontPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.frontTagName));

      // Without page breaks the whole body is one page; without a body there is no body page at all.
      const bodyPages = bodyPbs.length > 0
        ? bodyPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.bodyTagName))
        : (body
          ? [this.parseSinglePage(doc, body, 'page1', 'mainText')] // TODO: translate mainText
          : []);

      return {
        pages: [...frontPages, ...bodyPages],
      };
    }

    /**
     * Builds the page that starts at the page break `pb`.
     *
     * If there is a next page break, the page holds the nodes between the two
     * page breaks; otherwise it holds the nodes between `pb` and the last child
     * of the last `ancestorTagName` element of `doc`. Page breaks, CDATA
     * sections, processing instructions and comments are dropped.
     *
     * @param doc document the page break belongs to
     * @param pb page break opening the page
     * @param nextPb page break opening the following page, if any
     * @param ancestorTagName tag whose last occurrence closes the last page
     * @returns the page; its content is empty when no closing node can be found
     */
    parseDocumentPage(doc: Document, pb: XMLElement, nextPb: XMLElement, ancestorTagName: string): Page {
      const nextNode = nextPb || this.lastChildOfLastMatch(doc, ancestorTagName);
      const originalContent = nextNode
        ? getElementsBetweenTreeNode(pb, nextNode)
          .filter((n) => n.tagName !== this.pageTagName)
          .filter((c) => ![4, 7, 8].includes(c.nodeType)) // Filter comments, CDATAs, and processing instructions
        : [];

      return {
        id: getID(pb, 'page'),
        label: pb.getAttribute('n') || 'page',
        originalContent,
        parsedContent: this.parsePageContent(doc, originalContent),
      };
    }

    /** Returns the last child of the last `tagName` element in `doc`, or null when there is none. */
    private lastChildOfLastMatch(doc: Document, tagName: string) {
      const matches = doc.querySelectorAll(tagName);

      return matches.length > 0 ? matches[matches.length - 1].lastChild : null;
    }

    /**
     * Builds one page out of all the children of `el`.
     * An element without children yields a page with empty content.
     */
    private parseSinglePage(doc: Document, el: XMLElement, id: string, label: string): Page {
      const originalContent: XMLElement[] = el.firstChild
        ? getElementsBetweenTreeNode(el.firstChild, el.lastChild)
        : [];

      return {
        id,
        label,
        originalContent,
        parsedContent: this.parsePageContent(doc, originalContent),
      };
    }

    /**
     * Parses the nodes of a page and returns the flattened list of results.
     *
     * Each node is first resolved to its original node (`getEditionOrigNode`):
     * - nodes that are, or are nested in, `front` are parsed only if they hold or
     *   are marked as original content, and are dropped otherwise;
     * - a `text` element that contains a `front` is expanded into its children;
     * - any other node is parsed as is.
     */
    parsePageContent(doc: Document, pageContent: OriginalEncodingNodeType[]): Array<ParseResult<GenericElement>> {
      return pageContent
        .map((node) => {
          // The lookup may resolve to nothing; the node then stands in for itself.
          const origEl = getEditionOrigNode(node, doc) || node;

          if (origEl.nodeName === this.frontTagName || isNestedInElem(origEl, this.frontTagName)) {
            if (this.hasOriginalContent(origEl)) {
              return Array.from(node.querySelectorAll(`[type=${this.frontOrigContentAttr}]`))
                .map((c) => this.genericParserService.parse(c as XMLElement));
            }
            if (this.isMarkedAsOrigContent(origEl)) {
              return [this.genericParserService.parse(node)];
            }

            return [] as Array<ParseResult<GenericElement>>;
          }

          if (origEl.tagName === 'text' && origEl.querySelectorAll && origEl.querySelectorAll(this.frontTagName).length > 0) {
            return this.parsePageContent(doc, Array.from(node.children) as HTMLElement[]);
          }

          return [this.genericParserService.parse(node)];
        })
        .reduce((x, y) => x.concat(y), []);
    }

    /**
     * Tells whether `el` has at least one descendant tagged as original content.
     * Nodes that cannot be queried (text, comments, ...) never have any.
     */
    hasOriginalContent(el: XMLElement): boolean {
      return typeof el.querySelectorAll === 'function'
        && el.querySelectorAll(`[type=${this.frontOrigContentAttr}]`).length > 0;
    }

    /**
     * Tells whether `el` is an element that is tagged as original content itself,
     * has a descendant tagged so, or is reported by `isNestedInElem` as nested in
     * an element tagged so. Non-element nodes are never marked.
     */
    isMarkedAsOrigContent(el: XMLElement): boolean {
      // 1 === ELEMENT_NODE: only elements expose getAttribute.
      return el.nodeType === 1 &&
        (el.getAttribute('type') === this.frontOrigContentAttr ||
          this.hasOriginalContent(el) ||
          isNestedInElem(el, '', [{ key: 'type', value: this.frontOrigContentAttr }])
        );
    }
  }
  /**
   * Resolves the node that `el` points to through its `xpath` attribute.
   *
   * When the document element of `doc` has a namespace, every `/` of the path is
   * rewritten to `/ns:` before evaluation, with the resolver built by
   * `createNsResolver`.
   *
   * @param el node to resolve; may lack `getAttribute` (e.g. a text node)
   * @param doc document the XPath expression is evaluated against
   * @returns the first node matched by the expression, or `el` itself when it has
   *          no `xpath` attribute or the expression matches nothing
   */
  function getEditionOrigNode(el: XMLElement, doc: Document): XMLElement {
    const xpath = el.getAttribute ? el.getAttribute('xpath') : null;
    if (!xpath) {
      return el;
    }

    const path = doc.documentElement.namespaceURI ? xpath.replace(/\//g, '/ns:') : xpath;
    const xpathRes = doc.evaluate(path, doc, createNsResolver(doc), XPathResult.ANY_TYPE, null);

    // iterateNext() yields null when nothing matches; never hand a null node back to callers.
    return (xpathRes.iterateNext() as XMLElement) || el;
  }