faqts : Computers : Programming : Languages : Python : Snippets : Web Programming / Manipulating HTML files

+ Search
Add Entry AlertManage Folder Edit Entry Add page to http://del.icio.us/
Did You Find This Entry Useful?

29 of 33 people (88%) answered Yes
Recently 8 of 10 people (80%) answered Yes

Entry

Translate text into HTML

Jul 6th, 2004 06:52
amal k m, Nathan Wallace, unknown unknown, Hans Nowak, Snippet 115, Glyn Webster


"""
Packages: text.html
"""
""" Function to translate text from various character sets in to HTML
XXX I don't have a very good understanding of what some of the more
esoteric non-ISO-Latin-1 characters in the "Mac Roman" char set are for,
so my HTML substitutions for them may not be the best.
Glyn Webster <glyn@ninz.org.nz> 1999-03-ish
"""
# First, some tables mapping character-code to HTML entity for
# the ISO_LATIN_1, IBM_PC_EXTENDED and MAC_ROMAN character sets:
# (Characters that don't have HTML replacements, or don't need
# them, aren't in the tables.)
# ISO 8859-1 (Latin-1) character code -> HTML replacement text.
#
# Every value is pure ASCII: characters with a named HTML entity map to
# that entity, so the generated HTML does not depend on the encoding the
# page is eventually served with.
#
# NOTE(review): 10 -> '\r' and 13 -> '\n' are kept exactly as the table
# originally had them; confirm that the linefeed/return swap is intended.
ISO_LATIN_1 = {
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed)
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Characters that are significant in HTML markup.
    34: '&quot;', 38: '&amp;', 60: '&lt;', 62: '&gt;',
    # The upper half of Latin-1, using the standard entity names.
    160: '&nbsp;',   161: '&iexcl;',  162: '&cent;',   163: '&pound;',
    164: '&curren;', 165: '&yen;',    166: '&brvbar;', 167: '&sect;',
    168: '&uml;',    169: '&copy;',   170: '&ordf;',   171: '&laquo;',
    172: '&not;',    173: '&shy;',    174: '&reg;',    175: '&macr;',
    176: '&deg;',    177: '&plusmn;', 178: '&sup2;',   179: '&sup3;',
    180: '&acute;',  181: '&micro;',  182: '&para;',   183: '&middot;',
    184: '&cedil;',  185: '&sup1;',   186: '&ordm;',   187: '&raquo;',
    188: '&frac14;', 189: '&frac12;', 190: '&frac34;', 191: '&iquest;',
    192: '&Agrave;', 193: '&Aacute;', 194: '&Acirc;',  195: '&Atilde;',
    196: '&Auml;',   197: '&Aring;',  198: '&AElig;',  199: '&Ccedil;',
    200: '&Egrave;', 201: '&Eacute;', 202: '&Ecirc;',  203: '&Euml;',
    204: '&Igrave;', 205: '&Iacute;', 206: '&Icirc;',  207: '&Iuml;',
    208: '&ETH;',    209: '&Ntilde;', 210: '&Ograve;', 211: '&Oacute;',
    212: '&Ocirc;',  213: '&Otilde;', 214: '&Ouml;',   215: '&times;',
    216: '&Oslash;', 217: '&Ugrave;', 218: '&Uacute;', 219: '&Ucirc;',
    220: '&Uuml;',   221: '&Yacute;', 222: '&THORN;',  223: '&szlig;',
    224: '&agrave;', 225: '&aacute;', 226: '&acirc;',  227: '&atilde;',
    228: '&auml;',   229: '&aring;',  230: '&aelig;',  231: '&ccedil;',
    232: '&egrave;', 233: '&eacute;', 234: '&ecirc;',  235: '&euml;',
    236: '&igrave;', 237: '&iacute;', 238: '&icirc;',  239: '&iuml;',
    240: '&eth;',    241: '&ntilde;', 242: '&ograve;', 243: '&oacute;',
    244: '&ocirc;',  245: '&otilde;', 246: '&ouml;',   247: '&divide;',
    248: '&oslash;', 249: '&ugrave;', 250: '&uacute;', 251: '&ucirc;',
    252: '&uuml;',   253: '&yacute;', 254: '&thorn;',  255: '&yuml;',
}
# IBM PC "extended ASCII" (code page 437) character code -> HTML
# replacement text.  Every value is pure ASCII (entities or plain markup).
#
# NOTE(review): 10 -> '\r' and 13 -> '\n' are kept exactly as the table
# originally had them; confirm that the linefeed/return swap is intended.
IBM_PC_EXTENDED = {
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed)
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Low "control" positions that the PC displays as printable glyphs.
    15: '&curren;', 20: '&para;', 21: '&sect;',
    # Characters that are significant in HTML markup.
    34: '&quot;', 38: '&amp;', 60: '&lt;', 62: '&gt;',
    # Accented letters and currency signs.  129 (u-umlaut) and 130
    # (e-acute) are part of code page 437 and used to be missing, which
    # turned two very common letters into dots.
    128: '&Ccedil;', 129: '&uuml;',   130: '&eacute;', 131: '&acirc;',
    132: '&auml;',   133: '&agrave;', 134: '&aring;',  135: '&ccedil;',
    136: '&ecirc;',  137: '&euml;',   138: '&egrave;', 139: '&iuml;',
    140: '&icirc;',  141: '&igrave;', 142: '&Auml;',   143: '&Aring;',
    144: '&Eacute;', 145: '&aelig;',  146: '&AElig;',  147: '&ocirc;',
    148: '&ouml;',   149: '&ograve;', 150: '&ucirc;',  151: '&ugrave;',
    152: '&yuml;',   153: '&Ouml;',   154: '&Uuml;',   155: '&cent;',
    156: '&pound;',  157: '&yen;',
    160: '&aacute;', 161: '&iacute;', 162: '&oacute;', 163: '&uacute;',
    164: '&ntilde;', 165: '&Ntilde;', 166: '&ordf;',   167: '&ordm;',
    168: '&iquest;', 170: '&not;',    171: '&frac12;', 172: '&frac14;',
    173: '&iexcl;',  174: '&laquo;',  175: '&raquo;',
    225: '&szlig;',  # sharp s (previously missing)
    230: '&micro;',  237: '&oslash;',
    241: '&plusmn;', 246: '&divide;', 248: '&deg;',    249: '&middot;',
    253: '&sup2;',
    #                                     +--+
    # Box-drawing characters translate to |  | style boxes.
    #                                     +--+
    179: "|",  180: "+",  181: "+",  182: "+",  183: "+",  184: "+",
    185: "+",  186: "|",  187: "+",  188: "+",  189: "+",  190: "+",
    191: "+",  192: "+",  193: "+",  194: "+",  195: "+",  196: "-",
    197: "+",  198: "+",  199: "+",  200: "+",  201: "+",  202: "+",
    203: "+",  204: "+",  205: "-",  206: "+",  207: "+",  208: "+",
    209: "+",  210: "+",  211: "+",  212: "+",  213: "+",  214: "+",
    215: "+",  216: "+",  217: "+",  218: "+",
    # The remaining characters don't map onto the ISO Latin character set.
    # Most are ignored.  Some seem to have sensible substitutions, YMMV.
    # (Substituting them with little GIFs might be a better solution.)
    236: "INF",            # (infinity)
    242: "&gt;=",          # (greater-than-or-equal)
    243: "&lt;=",          # (less-than-or-equal)
    252: "<sup>n</sup>",   # (superscript n)
}
# Mac OS Roman character code -> HTML replacement text.  Every value is
# pure ASCII (entities or plain markup).
#
# NOTE(review): 10 -> '\r' and 13 -> '\n' are kept exactly as the table
# originally had them; confirm that the linefeed/return swap is intended.
MAC_ROMAN = {
    9: ' ',     # (tab)
    10: '\r',   # (linefeed)
    12: '',     # (formfeed)
    13: '\n',   # (return)
    32: ' ',    # (space)
    # Characters that are significant in HTML markup.  The other two
    # tables escape these; without them markup passes through untouched.
    34: '&quot;', 38: '&amp;', 60: '&lt;', 62: '&gt;',
    # Accented letters (128-159).
    128: "&Auml;",   129: "&Aring;",  130: "&Ccedil;", 131: "&Eacute;",
    132: "&Ntilde;", 133: "&Ouml;",   134: "&Uuml;",   135: "&aacute;",
    136: "&agrave;", 137: "&acirc;",  138: "&auml;",   139: "&atilde;",
    140: "&aring;",  141: "&ccedil;", 142: "&eacute;", 143: "&egrave;",
    144: "&ecirc;",  145: "&euml;",   146: "&iacute;", 147: "&igrave;",
    148: "&icirc;",  149: "&iuml;",   150: "&ntilde;", 151: "&oacute;",
    152: "&ograve;", 153: "&ocirc;",  154: "&ouml;",   155: "&otilde;",
    156: "&uacute;", 157: "&ugrave;", 158: "&ucirc;",  159: "&uuml;",
    # Symbols and letters that have a direct named entity.
    161: "&deg;",    162: "&cent;",   163: "&pound;",  164: "&sect;",
    165: "&middot;", 166: "&para;",   167: "&szlig;",  168: "&reg;",
    169: "&copy;",   171: "&acute;",  172: "&uml;",    174: "&AElig;",
    175: "&Oslash;", 177: "&plusmn;", 180: "&yen;",    181: "&micro;",
    187: "&ordf;",   188: "&ordm;",   190: "&aelig;",  191: "&oslash;",
    192: "&iquest;", 193: "&iexcl;",  194: "&not;",    199: "&laquo;",
    200: "&raquo;",  202: "&nbsp;",   203: "&Agrave;", 204: "&Atilde;",
    205: "&Otilde;", 206: "&OElig;",  207: "&oelig;",  214: "&divide;",
    216: "&yuml;",   217: "&Yuml;",   219: "&curren;", 225: "&middot;",
    229: "&Acirc;",  230: "&Ecirc;",  231: "&Aacute;", 232: "&Euml;",
    233: "&Egrave;", 234: "&Iacute;", 235: "&Icirc;",  236: "&Iuml;",
    237: "&Igrave;", 238: "&Oacute;", 239: "&Ocirc;",  241: "&Ograve;",
    242: "&Uacute;", 243: "&Ucirc;",  244: "&Ugrave;", 246: "&circ;",
    247: "&tilde;",  248: "&macr;",   252: "&cedil;",
    # These Mac Roman characters don't map onto the ISO Latin character
    # set.  They are given approximate substitutions, not all of them
    # satisfying.  (Substituting them with little GIFs might be a better
    # solution.)
    160: "<sup>**</sup>",   # (cross footnote mark / dagger)
    170: "<sup>TM</sup>",   # (trademark)
    173: "&ne;",            # (not equals)
    176: "INF",             # (infinity)
    178: "&lt;=",           # (less than or equals)
    179: "&gt;=",           # (greater than or equals)
    182: "d",               # (partial differential sign)
    183: "SIGMA",           # (capital sigma)
    184: "PI",              # (capital pi)
    185: "pi",              # (small pi)
    186: "&int;",           # (integration sign)
    189: "OMEGA",           # (capital omega)
    195: "SQRT",            # (square root)
    196: "f",               # (f with a descender)
    198: "DELTA",           # (capital delta)
    201: "...",             # (ellipsis)
    208: "-",               # (dash, short)
    209: "--",              # (dash, long)
    210: "&quot;",          # (opening double quote)
    211: "&quot;",          # (closing double quote)
    212: "`",               # (opening single quote)
    213: "'",               # (closing single quote)
    215: "&lt;&gt;",        # (diamond / lozenge, drawn as <>)
    218: "/",               # (slash)
    220: "(",               # (looks like a small opening bracket)
    221: ")",               # (looks like a small closing bracket)
    222: "fi",              # (printers' fi ligature)
    223: "fl",              # (printers' fl ligature)
    224: "<sup>***</sup>",  # (double-cross footnote mark)
    228: "<sup>o</sup>/oo", # (thousandth symbol: percent with extra 'o')
    245: "<sub>1</sub>",    # (subscript one)
    # Accents.  None of the Web browsers seen so far let these characters
    # over-strike the previous character as they are presumably meant to,
    # so it seems better to leave them out.
    226: "",                # (accent, looks like a ,)
    227: "",                # (accent, looks like a ,,)
    249: "",                # (short vowel mark)
    250: "",                # (dot accent)
    251: "",                # (ring accent)
    253: "",                # (double-acute accent)
    254: "",                # (backwards cedilla)
    255: "",                # (upside down ^ accent)
}
def htmlesc(s, charset = ISO_LATIN_1):
    """ Translate the characters of string 's' into HTML text.

        s       -- the text to translate, one character code per character.
        charset -- a table mapping character code (int) to replacement
                   string; defaults to ISO_LATIN_1.

        Every character that has an entry in 'charset' is replaced by
        that entry.  A character without an entry is kept unchanged when
        it is printable ASCII (codes 32..126); any other character
        without an entry becomes a middle dot, written as the ASCII
        entity "&middot;" so the result does not depend on the encoding
        the page is served with.

        Returns the translated string ('' for empty input).
    """
    pieces = []
    for c in s:
        code = ord(c)
        if 32 <= code <= 126:
            # Printable ASCII passes through unless the table overrides it.
            fallback = c
        else:
            fallback = "&middot;"
        pieces.append(charset.get(code, fallback))
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)
http://www.cheapinternationalflights.netfirms.com