format.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. import json
  2. import pandas as pd
  3. # Text highlighting
  4. def formatAllContexts(bibliocontexts: pd.DataFrame):
  5. index = 0
  6. for col in bibliocontexts.columns:
  7. if col.startswith('pitxt'):
  8. if index == 0:
  9. bibliocontexts['formattazione contesto'] = bibliocontexts.apply (lambda row: addHighlightToFormatting(row['formattazione contesto'], int(row['pitxt'] - row['piniz']), int(row['elemlen'])), axis=1)
  10. else:
  11. bibliocontexts['formattazione contesto'] = bibliocontexts.apply (lambda row: addHighlightToFormatting(row['formattazione contesto'], int(row['pitxt_'+str(index)] - row['piniz']), int(row['elemlen_'+str(index)])), axis=1)
  12. index += 1
  13. bibliocontexts['contesto formattato'] = bibliocontexts.apply (lambda row: formatContext(row['contesto'], row['formattazione contesto']), axis=1)
  14. return bibliocontexts
  15. # Define formatting information for a single context
  16. # Takes as input a non-formatted string + character-by-character format information
  17. def formatContext(context, formatCodesJson, strict=True):
  18. formatCodes = json.loads(formatCodesJson)
  19. if strict and len(formatCodes)!=len(context):
  20. raise ValueError("Formatting information does not match context")
  21. # Get formatting code CHANGES + first code
  22. formatChanges = [(0, formatCodes[0])] + [(ind, formatCodes[ind]) for ind in range(1, len(formatCodes)) if formatCodes[ind]!=formatCodes[ind-1]]
  23. # Define a dict containing string parts + format information for each part
  24. formattedContext = []
  25. for ind, form in enumerate(formatChanges):
  26. format0 = []
  27. if form[1]>0:
  28. format0 = bitToFormat(getBits(form[1]))
  29. start = form[0]
  30. end = formatChanges[ind+1][0] if ind < len(formatChanges)-1 else -1
  31. try:
  32. stringPart = context[start:end]
  33. formattedContext.append( {'formatting': format0, 'stringPart': stringPart} )
  34. except:
  35. pass
  36. return json.dumps(formattedContext)
  37. # For the sake of completeness, get an array of formatting codes back out of a formatted context
  38. def getFormattingCodes(formattedContext):
  39. toRet = []
  40. for contextPart in formattedContext:
  41. stringPart = contextPart['stringPart']
  42. code = getSingleFormattingCode(contextPart['formatting'])
  43. toRet = toRet + [code]*len(stringPart)
  44. return toRet
  45. #
  46. def getSingleFormattingCode(formatting):
  47. toRet = 0
  48. if 'grassetto' in formatting:
  49. toRet += 1
  50. if 'corsivo' in formatting:
  51. toRet += 2
  52. if 'sottolineato' in formatting:
  53. toRet += 4
  54. if 'barrato' in formatting:
  55. toRet += 8
  56. if 'evidenziato' in formatting:
  57. toRet += 16
  58. return toRet
  59. # Utility: modify a single format codes string to add highlighting
  60. def addHighlightToFormatting(formatCodesJson, highlightStart, highlightLength):
  61. formatCodes = json.loads(formatCodesJson)
  62. for index in range(highlightLength):
  63. try:
  64. formatCodes[highlightStart + index] += 16
  65. except IndexError:
  66. pass
  67. return json.dumps(formatCodes)
  68. # Formatting decoders -- from numeric code to explicit formatting
  69. # Legend:
'''
Formatting is encoded as (currently) five bits of information, packed
together into a single int:
0 = no formatting
1 (l.s.b.) = grassetto / bold
2 = corsivo / italic
4 = sottolineato / underline
8 = barrato / strikethrough
16 (m.s.b.) = evidenziato / highlight
Codes are ADDITIVE: for instance, 28 = 16 + 8 + 4 stands for
underlined, struck-through and highlighted text
'''
  82. def getBits(num):
  83. numOfBits = 5
  84. aa = bin(num) # Bin converts the input to a bit string (with prefix \b)
  85. return aa[2:].rjust(numOfBits, "0")
  86. def bitToFormat(bitString):
  87. format0 = []
  88. if bitString[-1]=="1":
  89. format0.append('grassetto')
  90. if bitString[-2]=="1":
  91. format0.append('corsivo')
  92. if bitString[-3]=="1":
  93. format0.append('sottolineato')
  94. if bitString[-4]=="1":
  95. format0.append('barrato')
  96. if bitString[-5]=="1":
  97. format0.append('evidenziato')
  98. return format0