#!/usr/bin/python
"""Badly named script. Converts non-ascii UTF-8 text into equivalent decimal
entities for use in HTML and XML. I have C and perl versions of this too.

Input is taken through fileinput.input(), i.e. from the files named on the
command line, or from standard input when none are given. The converted
text is written to standard output.

Requires Python >=2.0, >=2.3 is preferred. Also runs under Python 3."""

# Iain Murray 2005

from fileinput import input
import re
import sys

# The text type is called `unicode` on Python 2 and `str` on Python 3. Both
# accept (bytes, encoding) and perform a strict decode.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# Matches any single character outside the 7-bit ASCII range.
nonascii = re.compile('[^\x00-\x7F]')


def xmlescape(x):
    """Return the decimal character reference for a regex match.

    x is a match object whose whole match is a single character; the result
    has the form '&#NNN;' where NNN is that character's code point.
    """
    return '&#%d;' % ord(x.group(0))


def utf8_to_entities(raw):
    """Convert UTF-8 encoded bytes to pure-ASCII bytes.

    ASCII characters pass through untouched; every other character is
    replaced by its decimal character reference (e.g. '&#233;').

    raw -- a byte string holding UTF-8 text (typically one input line).

    Returns a byte string containing only ASCII.
    Raises UnicodeError if raw is not valid UTF-8 (decoding is strict).
    """
    text = _text_type(raw, 'utf-8')
    if sys.version_info[:2] >= (2, 3):
        # From python 2.3 "convert to ascii with xml escapes" is easy:
        return text.encode('ascii', 'xmlcharrefreplace')
    # Horrible version for pythons 2.0--2.2, which lack that error handler.
    return nonascii.sub(xmlescape, text).encode('ascii')


def main():
    """Filter the input files (or stdin) to stdout, line by line."""
    if sys.version_info[0] < 2:
        sys.stderr.write("Sorry, Python >=2.0 required\n")
        sys.exit(1)

    if sys.version_info[0] >= 3:
        # Python 3 would otherwise hand back already-decoded text using the
        # locale's encoding; ask for raw bytes so we do the UTF-8 decoding.
        lines = input(mode='rb')
    else:
        # Python 2 yields byte strings by default (and fileinput in very old
        # versions has no mode argument at all).
        lines = input()

    # Python 3 needs the underlying byte stream for bytes output; Python 2's
    # sys.stdout has no .buffer and takes byte strings directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    # write() rather than print, so converted lines pass through unchanged
    # on both major versions.
    for raw in lines:
        out.write(utf8_to_entities(raw))


if __name__ == '__main__':
    main()