import json

try:
    import polars as pl
except ImportError:
    # polars is only needed by formatAllContexts; the pure-Python helpers
    # below remain importable (and testable) without it.
    pl = None


# Formatting codes
#
# Formatting is set by (currently) five bits of information, codified
# together as an int:
#      0          = no formatting
#      1 (l.s.b.) = grassetto / bold
#      2          = corsivo / italic
#      4          = sottolineato / underline
#      8          = barrato / strikethrough
#     16 (m.s.b.) = evidenziato / highlight
# Codes are ADDITIVE: for instance 28 = 16 + 8 + 4 stands for underlined,
# struck-through and highlighted text.
_FORMAT_FLAGS = (
    (1, 'grassetto'),
    (2, 'corsivo'),
    (4, 'sottolineato'),
    (8, 'barrato'),
    (16, 'evidenziato'),
)
_HIGHLIGHT_BIT = 16


# Text highlighting
def _makeHighlighter(posCol, lenCol):
    """Build the per-row callback that highlights one (position, length) pair.

    The column names are captured as arguments of this factory, so the
    returned function does not depend on any loop variable of the caller.
    """
    def highlightRow(row):
        return addHighlightToFormatting(
            row['formattazione contesto'],
            int(row[posCol] - row['piniz']),
            int(row[lenCol]),
        )
    return highlightRow


def formatAllContexts(bibliocontexts: "pl.DataFrame") -> "pl.DataFrame":
    """Highlight every match in each context and build the formatted context.

    For each column whose name starts with ``'pitxt'`` the highlight bit is
    added to the ``'formattazione contesto'`` column. The first such column
    is read as ``'pitxt'`` / ``'elemlen'``; the n-th one (n >= 1) is read as
    ``'pitxt_n'`` / ``'elemlen_n'``. The highlight starts at
    ``pitxt - piniz`` and spans ``elemlen`` positions.
    (Presumably ``pitxt`` is the absolute offset of the match and ``piniz``
    the offset at which the context starts -- confirm against the caller.)

    Finally a ``'contesto formattato'`` column is added, holding the output
    of :func:`formatContext` for ``'contesto'`` and the updated format codes.

    Args:
        bibliocontexts: frame with the columns named above.

    Returns:
        The frame with ``'formattazione contesto'`` updated and
        ``'contesto formattato'`` added.

    Raises:
        ImportError: if polars is not installed.
    """
    if pl is None:
        raise ImportError("polars is required by formatAllContexts")

    # NOTE(review): `with_column` and `Expr.apply` are kept exactly as the
    # file had them; newer polars releases renamed them (`with_columns`,
    # `map_elements`) -- confirm against the pinned polars version.
    index = 0
    for col in bibliocontexts.columns:
        if not col.startswith('pitxt'):
            continue
        suffix = '' if index == 0 else f'_{index}'
        posCol = f'pitxt{suffix}'
        lenCol = f'elemlen{suffix}'
        bibliocontexts = bibliocontexts.with_column(
            pl.struct(['formattazione contesto', posCol, 'piniz', lenCol]).apply(
                _makeHighlighter(posCol, lenCol)
            ).alias('formattazione contesto')
        )
        index += 1

    bibliocontexts = bibliocontexts.with_column(
        pl.struct(['contesto', 'formattazione contesto']).apply(
            lambda x: formatContext(x['contesto'], x['formattazione contesto'])
        ).alias('contesto formattato')
    )
    return bibliocontexts


# Define formatting information for a single context
# Takes as input a non-formatted string + character-by-character format information
def formatContext(context, formatCodesJson, strict=True):
    """Split a context into runs of identically formatted characters.

    Args:
        context: the plain, non-formatted string.
        formatCodesJson: JSON array holding one numeric format code per
            character of ``context``.
        strict: when true, the number of codes must equal ``len(context)``.

    Returns:
        A JSON array of ``{'formatting': [...], 'stringPart': '...'}``
        objects, one per run, in order. ``'formatting'`` lists the format
        names decoded from the run's code (empty when the code is not
        positive). An empty list of codes yields ``"[]"``.

    Raises:
        ValueError: if ``strict`` is true and the lengths differ.
    """
    formatCodes = json.loads(formatCodesJson)
    if strict and len(formatCodes) != len(context):
        raise ValueError("Formatting information does not match context")
    if not formatCodes:
        # Nothing to split; also avoids indexing into an empty list below.
        return json.dumps([])

    # Positions at which the formatting code changes, plus the very first one.
    runStarts = [0] + [
        ind for ind in range(1, len(formatCodes))
        if formatCodes[ind] != formatCodes[ind - 1]
    ]
    # Each run ends where the next begins; the last one runs to the end of
    # the string (None), so the final character is not cut off.
    runEnds = runStarts[1:] + [None]

    formattedContext = []
    for start, end in zip(runStarts, runEnds):
        code = formatCodes[start]
        formatting = bitToFormat(getBits(code)) if code > 0 else []
        formattedContext.append(
            {'formatting': formatting, 'stringPart': context[start:end]}
        )
    return json.dumps(formattedContext)


# For the sake of completeness, get an array of formatting codes back out of a formatted context
def getFormattingCodes(formattedContext):
    """Expand a formatted context back into one numeric code per character.

    Args:
        formattedContext: iterable of dicts with ``'formatting'`` and
            ``'stringPart'`` keys (the already-parsed form, not the JSON
            text).

    Returns:
        A list with the code of each part repeated ``len(stringPart)`` times.
    """
    codes = []
    for contextPart in formattedContext:
        code = getSingleFormattingCode(contextPart['formatting'])
        codes.extend([code] * len(contextPart['stringPart']))
    return codes


def getSingleFormattingCode(formatting):
    """Return the numeric code obtained by summing the flags named in ``formatting``."""
    return sum(bit for bit, name in _FORMAT_FLAGS if name in formatting)


# Utility: modify a single format codes string to add highlighting
def addHighlightToFormatting(formatCodesJson, highlightStart, highlightLength):
    """Set the highlight bit on a span of a JSON array of format codes.

    Positions outside the array are ignored, including negative ones (which
    would otherwise wrap around to the end of the list). The bit is OR-ed
    in, so highlighting an already highlighted position leaves it unchanged
    instead of overflowing into an undefined code.

    Args:
        formatCodesJson: JSON array of integer format codes.
        highlightStart: index of the first position to highlight.
        highlightLength: number of positions to highlight.

    Returns:
        The updated codes, serialized as JSON.
    """
    formatCodes = json.loads(formatCodesJson)
    first = max(highlightStart, 0)
    stop = min(highlightStart + highlightLength, len(formatCodes))
    for position in range(first, stop):
        formatCodes[position] |= _HIGHLIGHT_BIT
    return json.dumps(formatCodes)


# Formatting decoders -- from numeric code to explicit formatting
def getBits(num, numOfBits=5):
    """Return ``num`` as a binary string, left-padded with zeros.

    Args:
        num: non-negative integer format code.
        numOfBits: minimum width of the result; longer values are not
            truncated.

    Raises:
        ValueError: if ``num`` is negative.
    """
    if num < 0:
        raise ValueError("Formatting code must be non-negative")
    # bin() returns the bit string with a "0b" prefix, dropped here.
    return bin(num)[2:].rjust(numOfBits, "0")


def bitToFormat(bitString):
    """Decode a bit string into the list of format names it enables.

    The last character is the least significant bit. Only the last five
    characters are inspected; shorter strings are treated as if padded with
    zeros on the left.

    Returns:
        Format names in the order grassetto, corsivo, sottolineato,
        barrato, evidenziato.
    """
    padded = bitString.rjust(len(_FORMAT_FLAGS), "0")
    return [
        name
        for offset, (_, name) in enumerate(_FORMAT_FLAGS, start=1)
        if padded[-offset] == "1"
    ]