# example.py
#
# Example of some tricky sanitization problems

import sys
import unicodedata

# A tricky string: accented Latin letters mixed with form-feed, tab and
# carriage-return/newline control characters.
s = 'p\xfdt\u0125\xf6\xf1\x0cis\tawesome\r\n'
print(s)

# (a) Remapping whitespace
# str.translate() takes a table keyed by code point (int); a value of None
# deletes the character, any other value replaces it.
remap = {
    ord('\t'): ' ',
    ord('\f'): ' ',
    ord('\r'): None,  # Deleted
}
a = s.translate(remap)
print('whitespace remapped:', a)

# (b) Remove all combining characters/marks
# Build a table mapping every combining code point to None.  The range must
# run to sys.maxunicode + 1 so the highest code point is examined as well.
cmb_chrs = dict.fromkeys(
    cp for cp in range(sys.maxunicode + 1)
    if unicodedata.combining(chr(cp))
)
# NFD splits each accented letter into a base letter followed by combining
# marks, so deleting the marks leaves only the base letters.
b = unicodedata.normalize('NFD', a)
c = b.translate(cmb_chrs)
print('accents removed:', c)

# (c) Accent removal using I/O decoding
# Encoding the decomposed text with errors='ignore' drops every non-ASCII
# character (here, the combining marks); decoding returns a str again.
d = b.encode('ascii', 'ignore').decode('ascii')
print('accents removed via I/O:', d)