Entry
Translate text into HTML
Jul 6th, 2004 06:52
amal k m, Nathan Wallace, unknown unknown, Hans Nowak, Snippet 115, Glyn Webster
"""
Packages: text.html
"""
""" Function to translate text from various character sets in to HTML
XXX I don't have a very good understanding of what some of the more
esoteric non-ISO-Latin-1 characters in the "Mac Roman" char set are for,
so my HTML substitutions for them may not be the best.
Glyn Webster <glyn@ninz.org.nz> 1999-03-ish
"""
# First, some tables mapping character-code to HTML entity for
# the ISO_LATIN_1, IBM_PC_EXTENDED and MAC_ROMAN character sets:
# (Characters that don't have HTML replacements, or don't need
# them, aren't in the tables.)
# Character code -> HTML replacement text for ISO Latin-1 (ISO 8859-1) input.
# Every value is plain ASCII: the four HTML markup characters and all of
# 160-255 are written as named character references so the output is safe
# to embed in a page regardless of the page's own encoding.
ISO_LATIN_1 = {
    # Whitespace and control characters.
    # NOTE(review): linefeed maps to '\r' and return maps to '\n'; this looks
    # swapped but is kept as found -- confirm it is intended.
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed) removed from the output
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Characters that are significant in HTML markup.
    34: '&quot;', 38: '&amp;', 60: '&lt;', 62: '&gt;',
    # Symbols and punctuation, 160-191.
    160: '&nbsp;', 161: '&iexcl;', 162: '&cent;', 163: '&pound;',
    164: '&curren;', 165: '&yen;', 166: '&brvbar;', 167: '&sect;',
    168: '&uml;', 169: '&copy;', 170: '&ordf;', 171: '&laquo;',
    172: '&not;', 173: '&shy;', 174: '&reg;', 175: '&macr;',
    176: '&deg;', 177: '&plusmn;', 178: '&sup2;', 179: '&sup3;',
    180: '&acute;', 181: '&micro;', 182: '&para;', 183: '&middot;',
    184: '&cedil;', 185: '&sup1;', 186: '&ordm;', 187: '&raquo;',
    188: '&frac14;', 189: '&frac12;', 190: '&frac34;', 191: '&iquest;',
    # Upper-case letters, 192-223.
    192: '&Agrave;', 193: '&Aacute;', 194: '&Acirc;', 195: '&Atilde;',
    196: '&Auml;', 197: '&Aring;', 198: '&AElig;', 199: '&Ccedil;',
    200: '&Egrave;', 201: '&Eacute;', 202: '&Ecirc;', 203: '&Euml;',
    204: '&Igrave;', 205: '&Iacute;', 206: '&Icirc;', 207: '&Iuml;',
    208: '&ETH;', 209: '&Ntilde;', 210: '&Ograve;', 211: '&Oacute;',
    212: '&Ocirc;', 213: '&Otilde;', 214: '&Ouml;', 215: '&times;',
    216: '&Oslash;', 217: '&Ugrave;', 218: '&Uacute;', 219: '&Ucirc;',
    220: '&Uuml;', 221: '&Yacute;', 222: '&THORN;', 223: '&szlig;',
    # Lower-case letters, 224-255.
    224: '&agrave;', 225: '&aacute;', 226: '&acirc;', 227: '&atilde;',
    228: '&auml;', 229: '&aring;', 230: '&aelig;', 231: '&ccedil;',
    232: '&egrave;', 233: '&eacute;', 234: '&ecirc;', 235: '&euml;',
    236: '&igrave;', 237: '&iacute;', 238: '&icirc;', 239: '&iuml;',
    240: '&eth;', 241: '&ntilde;', 242: '&ograve;', 243: '&oacute;',
    244: '&ocirc;', 245: '&otilde;', 246: '&ouml;', 247: '&divide;',
    248: '&oslash;', 249: '&ugrave;', 250: '&uacute;', 251: '&ucirc;',
    252: '&uuml;', 253: '&yacute;', 254: '&thorn;', 255: '&yuml;',
}
# Character code -> HTML replacement text for the IBM PC extended character
# set (code page 437).  Every value is plain ASCII: named character
# references where one exists, otherwise an ASCII or markup approximation.
IBM_PC_EXTENDED = {
    # Whitespace and control characters.
    # NOTE(review): linefeed maps to '\r' and return maps to '\n'; this looks
    # swapped but is kept as found -- confirm it is intended.
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed) removed from the output
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Low-range graphic characters that have a Latin-1 look-alike.
    15: '&curren;', 20: '&para;', 21: '&sect;',
    # Characters that are significant in HTML markup.
    34: '&quot;', 38: '&amp;', 60: '&lt;', 62: '&gt;',
    # Accented letters and symbols that exist in ISO Latin-1.
    # 129 (u-umlaut), 130 (e-acute) and 225 (sharp s) are part of code page
    # 437 as well; without an entry they would be rendered as a dot.
    128: '&Ccedil;', 129: '&uuml;', 130: '&eacute;', 131: '&acirc;',
    132: '&auml;', 133: '&agrave;', 134: '&aring;', 135: '&ccedil;',
    136: '&ecirc;', 137: '&euml;', 138: '&egrave;', 139: '&iuml;',
    140: '&icirc;', 141: '&igrave;', 142: '&Auml;', 143: '&Aring;',
    144: '&Eacute;', 145: '&aelig;', 146: '&AElig;', 147: '&ocirc;',
    148: '&ouml;', 149: '&ograve;', 150: '&ucirc;', 151: '&ugrave;',
    152: '&yuml;', 153: '&Ouml;', 154: '&Uuml;', 155: '&cent;',
    156: '&pound;', 157: '&yen;',
    160: '&aacute;', 161: '&iacute;', 162: '&oacute;', 163: '&uacute;',
    164: '&ntilde;', 165: '&Ntilde;', 166: '&ordf;', 167: '&ordm;',
    168: '&iquest;', 170: '&not;', 171: '&frac12;', 172: '&frac14;',
    173: '&iexcl;', 174: '&laquo;', 175: '&raquo;',
    225: '&szlig;', 230: '&micro;', 237: '&oslash;',
    241: '&plusmn;', 246: '&divide;', 248: '&deg;', 249: '&middot;',
    253: '&sup2;',
    #                                                       +--+
    # Box-drawing characters translate to ASCII-art boxes:  |  |
    #                                                       +--+
    179: "|", 180: "+", 181: "+", 182: "+", 183: "+", 184: "+",
    185: "+", 186: "|", 187: "+", 188: "+", 189: "+", 190: "+",
    191: "+", 192: "+", 193: "+", 194: "+", 195: "+", 196: "-",
    197: "+", 198: "+", 199: "+", 200: "+", 201: "+", 202: "+",
    203: "+", 204: "+", 205: "-", 206: "+", 207: "+", 208: "+",
    209: "+", 210: "+", 211: "+", 212: "+", 213: "+", 214: "+",
    215: "+", 216: "+", 217: "+", 218: "+",
    # The remaining characters don't map onto the ISO Latin character set.
    # Most are ignored.  Some seem to have sensible substitutions, YMMV.
    # (Substituting them with little GIFs might be a better solution.)
    236: "INF",             # (infinity)
    242: "&gt;=",           # (greater-than-or-equals); '>' must be escaped
    243: "&lt;=",           # (less-than-or-equals); '<' must be escaped
    252: "<sup>n</sup>",    # (superscript n) -- intentional markup
}
# Character code -> HTML replacement text for the Mac OS Roman character set.
# Every value is plain ASCII: named character references where one exists,
# otherwise an ASCII or markup approximation.
MAC_ROMAN = {
    # Whitespace and control characters.
    # NOTE(review): linefeed maps to '\r' and return maps to '\n'; this looks
    # swapped but is kept as found -- confirm it is intended.
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed) removed from the output
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Characters that are significant in HTML markup (same entries as the
    # ISO_LATIN_1 and IBM_PC_EXTENDED tables).
    34: "&quot;", 38: "&amp;", 60: "&lt;", 62: "&gt;",
    # Characters that have a named HTML entity.
    128: "&Auml;", 129: "&Aring;", 130: "&Ccedil;", 131: "&Eacute;",
    132: "&Ntilde;", 133: "&Ouml;", 134: "&Uuml;", 135: "&aacute;",
    136: "&agrave;", 137: "&acirc;", 138: "&auml;", 139: "&atilde;",
    140: "&aring;", 141: "&ccedil;", 142: "&eacute;", 143: "&egrave;",
    144: "&ecirc;", 145: "&euml;", 146: "&iacute;", 147: "&igrave;",
    148: "&icirc;", 149: "&iuml;", 150: "&ntilde;", 151: "&oacute;",
    152: "&ograve;", 153: "&ocirc;", 154: "&ouml;", 155: "&otilde;",
    156: "&uacute;", 157: "&ugrave;", 158: "&ucirc;", 159: "&uuml;",
    161: "&deg;", 162: "&cent;", 163: "&pound;", 164: "&sect;",
    165: "&middot;", 166: "&para;", 167: "&szlig;", 168: "&reg;",
    169: "&copy;", 171: "&acute;", 172: "&uml;", 174: "&AElig;",
    175: "&Oslash;", 177: "&plusmn;", 180: "&yen;", 181: "&micro;",
    187: "&ordf;", 188: "&ordm;", 190: "&aelig;", 191: "&oslash;",
    192: "&iquest;", 193: "&iexcl;", 194: "&not;", 199: "&laquo;",
    200: "&raquo;", 202: "&nbsp;", 203: "&Agrave;", 204: "&Atilde;",
    205: "&Otilde;", 206: "&OElig;", 207: "&oelig;", 214: "&divide;",
    216: "&yuml;", 217: "&Yuml;", 219: "&curren;", 225: "&middot;",
    229: "&Acirc;", 230: "&Ecirc;", 231: "&Aacute;", 232: "&Euml;",
    233: "&Egrave;", 234: "&Iacute;", 235: "&Icirc;", 236: "&Iuml;",
    237: "&Igrave;", 238: "&Oacute;", 239: "&Ocirc;", 241: "&Ograve;",
    242: "&Uacute;", 243: "&Ucirc;", 244: "&Ugrave;", 246: "&circ;",
    247: "&tilde;", 248: "&macr;", 252: "&cedil;",
    # These Mac Roman characters don't map onto the ISO Latin character set.
    # They are given what seem to be sensible substitutions, but a lot of
    # them are not really satisfactory.  (Substituting them with little GIFs
    # might be a better solution.)
    160: "<sup>**</sup>",       # (cross footnote mark)
    170: "<sup>TM</sup>",       # (trademark)
    # NOTE(review): "!=" and "INT" follow the ASCII-substitute style used
    # for the other maths symbols here -- confirm the preferred spelling.
    173: "!=",                  # (not equals)
    176: "INF",                 # (infinity)
    178: "&lt;=",               # (less than or equals); '<' must be escaped
    179: "&gt;=",               # (greater than or equals); '>' must be escaped
    182: "d",                   # (partial-differential sign)
    183: "SIGMA",               # (capital sigma)
    184: "PI",                  # (capital pi)
    185: "pi",                  # (small pi)
    186: "INT",                 # (integration sign)
    189: "OMEGA",               # (capital omega)
    195: "SQRT",                # (square root)
    196: "f",                   # (looks like an f with a descender)
    198: "DELTA",               # (capital delta)
    201: "...",                 # (ellipsis)
    208: "-",                   # (dash, short)
    209: "--",                  # (dash, long)
    210: "&quot;",              # (opening double quote)
    211: "&quot;",              # (closing double quote)
    212: "`",                   # (opening single quote)
    213: "'",                   # (closing single quote)
    215: "&lt;&gt;",            # (diamond), rendered as <>
    218: "/",                   # (slash)
    220: "(",                   # (looks like a small opening bracket)
    221: ")",                   # (looks like a small closing bracket)
    222: "fi",                  # (printers' fi ligature)
    223: "fl",                  # (printers' fl ligature)
    224: "<sup>***</sup>",      # (double-cross footnote mark)
    228: "<sup>o</sup>/oo",     # (thousandth symbol: percent with extra 'o')
    245: "<sub>1</sub>",        # (subscript one)
    # Accents.  None of the Web browsers the author had seen let these
    # characters over-strike the previous character as they are presumably
    # meant to, so it seems better to leave them out.
    226: "",    # (accent, looks like a ,)
    227: "",    # (accent, looks like a ,,)
    249: "",    # (short vowel mark)
    250: "",    # (dot accent, I think)
    251: "",    # (ring accent)
    253: "",    # (double-acute accent, I think)
    254: "",    # (backwards cedilla, I think)
    255: "",    # (upside down ^ accent)
}
def htmlesc(s, charset=None):
    """Return 's' with markup and non-ASCII characters replaced for HTML.

    Each character of 's' is looked up by character code in 'charset', a
    dict mapping character code to replacement text (for example
    ISO_LATIN_1, IBM_PC_EXTENDED or MAC_ROMAN).

    - Printable ASCII (codes 32-126) with no table entry is copied through
      unchanged.
    - Any other character with no table entry becomes a dot, written as
      the entity '&middot;' so the result stays plain ASCII.

    's' may be a text string or a bytes-like object; for bytes each byte
    value is used directly as the character code.

    'charset' defaults to ISO_LATIN_1.  The default is resolved when the
    function is called rather than when it is defined.

    The result is always a text string; an empty 's' gives ''.
    """
    if charset is None:
        charset = ISO_LATIN_1
    pieces = []
    for c in s:
        # Iterating over bytes yields ints, iterating over text yields
        # one-character strings; normalise both to a character code.
        code = c if isinstance(c, int) else ord(c)
        if 32 <= code <= 126:
            pieces.append(charset.get(code, chr(code)))
        else:
            pieces.append(charset.get(code, "&middot;"))
    # A single join avoids the quadratic cost of repeated concatenation.
    return "".join(pieces)
http://www.cheapinternationalflights.netfirms.com