// Wikipedia:AutoEd/unicodify.js
/**
 * Replaces HTML character references in a piece of text with the literal
 * Unicode characters they stand for.
 *
 * Three tasks run in this order:
 *   1. Named entities (&mdash;, &alpha;, ...) are replaced from lookup tables.
 *      &lt;, &gt; and &amp; are deliberately left alone.
 *   2. Numeric entities (&#8212;, &#x2014;) are replaced, except those that
 *      decode to one of the characters listed in dont_replace.
 *   3. C1 control characters (U+0080-U+009F) are replaced with the character
 *      Windows-1252 assigns to that byte, or with an HTML comment when
 *      Windows-1252 has no mapping for it.
 *
 * @param {string} str Text to convert.
 * @returns {string} The converted text.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes
    var hasOwn = Object.prototype.hasOwnProperty;

    // Task 1: Replace named html entities with unicode

    // Entity names matched regardless of case (keys are lower case).
    var anyCaseEntities = {
        // Most common replacements
        mdash: '—', ndash: '–',
        // XML and HTML symbols
        hellip: '...', plus: '+', plusmn: '±', minus: '−', times: '×', divide: '÷',
        ne: '≠', asymp: '≈', le: '≤', ge: '≥', quot: '"', apos: "'",
        iexcl: '¡', cent: '¢', pound: '£', curren: '¤', yen: '¥', brvbar: '¦',
        sect: '§', uml: '¨', copy: '©', ordf: 'ª', laquo: '«', not: '¬',
        reg: '®', macr: '¯', deg: '°', sup2: '²', sup3: '³', acute: '´',
        micro: 'µ', para: '¶', middot: '·', cedil: '¸', sup1: '¹', ordm: 'º',
        raquo: '»', frac14: '¼', frac12: '½', frac34: '¾', iquest: '¿',
        circ: 'ˆ', tilde: '˜', lsquo: '‘', rsquo: '’', sbquo: '‚',
        ldquo: '“', rdquo: '”', bdquo: '„', bull: '•', permil: '‰',
        lsaquo: '‹', rsaquo: '›', oline: '‾', frasl: '⁄', euro: '€',
        image: 'ℑ', weierp: '℘', real: 'ℜ', trade: '™', alefsym: 'ℵ', crarr: '↵',
        forall: '∀', part: '∂', exist: '∃', empty: '∅', nabla: '∇',
        isin: '∈', notin: '∉', ni: '∋', prod: '∏', sum: '∑', lowast: '∗',
        radic: '√', prop: '∝', infin: '∞', ang: '∠', and: '∧', or: '∨',
        cap: '∩', cup: '∪', int: '∫', there4: '∴', sim: '∼', cong: '≅',
        sub: '⊂', sup: '⊃', nsub: '⊄', sube: '⊆', supe: '⊇',
        oplus: '⊕', otimes: '⊗', perp: '⊥', sdot: '⋅',
        lceil: '⌈', rceil: '⌉', lfloor: '⌊', rfloor: '⌋', lang: '〈', rang: '〉',
        loz: '◊', spades: '♠', clubs: '♣', hearts: '♥', diams: '♦'
    };

    // Entity names where case selects the character (&Alpha; vs &alpha;,
    // &Prime; vs &prime;, &lArr; vs &larr;), so they must match exactly.
    var exactCaseEntities = {
        // Greek symbols, upper case
        Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε', Zeta: 'Ζ',
        Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ', Lambda: 'Λ', Mu: 'Μ',
        Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο', Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ',
        Tau: 'Τ', Upsilon: 'Υ', Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',
        // Latin symbols, upper case
        Agrave: 'À', Aacute: 'Á', Acirc: 'Â', Atilde: 'Ã', Auml: 'Ä', Aring: 'Å',
        AElig: 'Æ', Ccedil: 'Ç', Egrave: 'È', Eacute: 'É', Ecirc: 'Ê', Euml: 'Ë',
        Igrave: 'Ì', Iacute: 'Í', Icirc: 'Î', Iuml: 'Ï', Ntilde: 'Ñ',
        Ograve: 'Ò', Oacute: 'Ó', Ocirc: 'Ô', Otilde: 'Õ', Ouml: 'Ö', Oslash: 'Ø',
        Ugrave: 'Ù', Uacute: 'Ú', Ucirc: 'Û', Uuml: 'Ü', Yacute: 'Ý',
        Scaron: 'Š', Yuml: 'Ÿ', ETH: 'Ð', THORN: 'Þ', OElig: 'Œ',
        // XML and HTML symbols, upper case
        Dagger: '‡', Prime: '″',
        // Greek symbols, lower case
        alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε', zeta: 'ζ',
        eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ', lambda: 'λ', mu: 'μ',
        nu: 'ν', xi: 'ξ', omicron: 'ο', pi: 'π', rho: 'ρ', sigmaf: 'ς',
        sigma: 'σ', tau: 'τ', upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ',
        omega: 'ω', thetasym: 'ϑ', upsih: 'ϒ', piv: 'ϖ',
        // Latin symbols, lower case
        szlig: 'ß', agrave: 'à', aacute: 'á', acirc: 'â', atilde: 'ã', auml: 'ä',
        aring: 'å', aelig: 'æ', ccedil: 'ç', egrave: 'è', eacute: 'é', ecirc: 'ê',
        euml: 'ë', igrave: 'ì', iacute: 'í', icirc: 'î', iuml: 'ï', eth: 'ð',
        ntilde: 'ñ', ograve: 'ò', oacute: 'ó', ocirc: 'ô', otilde: 'õ', ouml: 'ö',
        oslash: 'ø', ugrave: 'ù', uacute: 'ú', ucirc: 'û', uuml: 'ü',
        yacute: 'ý', thorn: 'þ', yuml: 'ÿ', oelig: 'œ', scaron: 'š', fnof: 'ƒ',
        // XML and HTML symbols, lower case
        dagger: '†', prime: '′',
        // Arrows
        larr: '←', rarr: '→', uarr: '↑', darr: '↓',
        lArr: '⇐', rArr: '⇒', uArr: '⇑', dArr: '⇓',
        harr: '↔', hArr: '⇔'
    };

    // False positives, intentionally absent from both tables:
    //   &lt; and &gt; - replacing them breaks large amounts of code which
    //                   discuss programming/scripting.
    //   &amp;         - replacing it breaks a large number of URLs and
    //                   discussion of programming/scripting.

    // One pass over every "&name;" token. Unknown names are returned as-is.
    // hasOwnProperty keeps names such as "constructor" from resolving to
    // members inherited from Object.prototype.
    str = str.replace(/&([A-Za-z][A-Za-z0-9]*);/g, function (entity, name) {
        var lowered = name.toLowerCase();
        if (hasOwn.call(anyCaseEntities, lowered)) {
            return anyCaseEntities[lowered];
        }
        if (hasOwn.call(exactCaseEntities, name)) {
            return exactCaseEntities[name];
        }
        return entity;
    });

    // ASCII-art arrows
    str = str.replace(/<==|<--/gi, '←');
    str = str.replace(/==>/gi, '→');

    // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )

    // Symbols for which there may be a good reason to obfuscate/escape
    var dont_replace = "|!{}[]=<>";

    // Decimal and hexadecimal references are handled in a single pass so that
    // text produced by one replacement is never decoded a second time
    // (e.g. "&#38;#x41;" becomes "&#x41;", not "A").
    str = str.replace(/&#(?:(\d+)|x([\da-f]+));/gi, function (entity, dec, hex) {
        var num = dec ? parseInt(dec, 10) : parseInt(hex, 16);
        // Leave NUL and anything beyond the Unicode range untouched.
        if (!(num >= 1 && num <= 0x10FFFF)) {
            return entity;
        }
        var chr;
        // see [[UTF-16]] for chars outside the BMP
        if (num > 0xFFFF) {
            num -= 0x10000;
            chr = String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
        } else {
            chr = String.fromCharCode(num);
        }
        return dont_replace.indexOf(chr) == -1 ? chr : entity;
    });

    // Task 3: Unprintable control characters [[Windows-1252]] from User:CharlotteWebb
    var failstr = "<!-- AutoEd: rm unicode ctrl char w/no win-1252 mapping, intent unknown -->";

    // Replacement for each code unit U+0080..U+009F, indexed by (code - 0x80).
    var win1252 = [
        '€', failstr, '‚', 'ƒ', '„', '…', '†', '‡',       // 0x80-0x87
        'ˆ', '‰', 'Š', '‹', 'Œ', failstr, 'Ž', failstr,   // 0x88-0x8F
        failstr, '‘', '’', '“', '”', '•', '–', '—',       // 0x90-0x97
        '˜', '™', 'š', '›', 'œ', failstr, 'ž', 'Ÿ'        // 0x98-0x9F
    ];
    str = str.replace(/[\u0080-\u009f]/g, function (ctrl) {
        return win1252[ctrl.charCodeAt(0) - 0x80];
    });

    return str;
}