Difference between revisions of "MediaWiki:TextCleaner.js"
Jump to navigation
Jump to search
(cp from WikiCommons; version 14:08, 10 January 2015 Lupo) |
(No difference)
|
Latest revision as of 05:01, 5 April 2015
// <source lang="javascript">
/*
  Wikitext sanitation for MediaWiki

  Author: [[User:Lupo]], January 2008
  License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

  Choose whichever license of these you like best :-)
*/
/*jshint curly:false, eqnull:true, laxbreak:true */

var TextCleaner = {

  // Cache of the names of the image namespace (id 6), filled on the first call of
  // sanitizeWikiText that actually has to process something. Names are compared
  // case-insensitively.
  imgNamespaceNames : null,

  // This function attempts to construct well-formed wikitext from input that may contain
  // possibly broken wikitext.
  //
  // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
  // of templates, and due to the fact that image thumbnail captions may themselves contain
  // links. This implementation catches the most common errors (such as forgetting to close a
  // template or a link), and even some more elaborate ones. With enough malice, this sanitation
  // can still be broken by user input such that the result is not well-formed wikitext as the
  // parser at the servers would like to have it. (It's still possible that the result is broken
  // wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext
  // into broken wikitext.)
  //
  // If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original
  // image link was a thumbnail or had a width smaller than 300px specified.
  //
  // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
  // probably rather inefficient due to the many substrings that are generated. This function is
  // primarily intended to be used to clean up user input in forms, which are typically rather
  // short.
  //
  // Parameters:
  //   input       - the wikitext to clean up. null/undefined is returned unchanged.
  //   only_thumbs - see above.
  // Returns: the cleaned-up wikitext; the input itself if it contains nothing that needs
  //          processing.
  //
  // Internally, the multi-character tokens are replaced by one-character placeholders:
  //   \x01 {{    \x02 }}    \x03 [[    \x04 ]]
  //   \x05 {|    \x06 |}    \x07 <gallery>    \x08 </gallery>
  // A closing placeholder is always the opening placeholder's character code plus one.
  // Input that already contains these control characters is not treated specially.
  sanitizeWikiText : function (input, only_thumbs) {
    if (input == null) return input;
    // Tag names are matched case-insensitively, exactly like in the scanning regexps below.
    if (input.search(/[\][}{]|<nowiki(\s[^>]*)?>|<\!--/i) < 0) return input; // No critical characters

    if (!TextCleaner.imgNamespaceNames) {
      TextCleaner.imgNamespaceNames = [];
      // Outside of a MediaWiki page there is no 'mw'; then only the canonical names are used.
      var namespaceIds =
        (typeof mw !== 'undefined' && mw && mw.config) ? mw.config.get('wgNamespaceIds') : null;
      if (namespaceIds) {
        for (var name in namespaceIds) {
          if (namespaceIds[name] == 6) // Image namespace
            TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = name;
        }
      }
      // Make sure that we have the two canonical names
      TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = 'Image';
      TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = 'File';
      // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
    }

    // consumed[level - 1] holds the number of input characters the most recent call of
    // sanitize at nesting level 'level' has used up.
    var consumed = [0, 0];
    // For image captions. Image caption may contain links, and may even contain images.
    // The current MediaWiki parser actually allows this only once. For deeper recursions,
    // it fails. But here, it's actually easier to implement no limit.

    var base_regexp = new RegExp
          ( "[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]"
           + "|\<nowiki(\\s[^>]*)?\>|\<\!--"
          , "i"); // Ignore case
    var nowiki_regexp = new RegExp("\<\\/nowiki(\\s[^>]*)?\>|\<\!--", "i");

    var allow_only_thumbs = only_thumbs;

    // Returns the image namespace prefix (including the colon, in the spelling used in
    // 'text') if 'text' has one at index 'pos', and null otherwise.
    function get_initial (pos, text) {
      var names = TextCleaner.imgNamespaceNames;
      for (var n = 0; n < names.length; n++) {
        var prefix_length = names[n].length + 1;
        if (text.length >= pos + prefix_length) {
          var t = text.substring(pos, pos + prefix_length);
          if (t.toLowerCase() == (names[n].toLowerCase() + ':')) return t;
        }
      }
      return null;
    }

    // Sanitizes 's' (which already contains placeholders) and returns the result.
    //
    //   with_links     - if false, scanning stops at a link start outside of any template/table
    //   caption_level  - nesting depth; 0 for the outermost call. For levels > 0 the number
    //                    of characters used up is stored in consumed[caption_level - 1].
    //   allow_thumbs   - if false, image links are never treated as thumbnails
    //   break_at_pipe  - if true, scanning stops at a '|' or table end outside of any
    //                    template/table
    //   with_tables    - if false, scanning stops at a table start
    //   with_galleries - if false, scanning stops at a gallery start
    function sanitize (s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
      if (!s || s.length === 0) {
        if (caption_level > 0) {
          if (consumed.length < caption_level)
            consumed.push(0);
          else
            consumed[caption_level-1] = 0;
        }
        return s;
      }

      var result         = "";
      var initial_length = s.length;
      var get_out        = false;
      var in_nowiki      = false;
      var endings        = null; // Stack recording template and table nesting; null if empty
      var next;
      var ch;
      // Must be local: each (recursive) invocation has its own scanning mode.
      var regexp         = base_regexp;

      function push_end (val) {
        if (endings == null) endings = [];
        endings.push(val);
      }

      function pop_end () {
        if (endings == null) return null; // Shouldn't happen
        var top = endings.pop();
        if (endings.length === 0) endings = null;
        return top;
      }

      while (s.length > 0 && !get_out) {
        next = s.search(regexp);
        if (next < 0) {
          result = result + s;
          break;
        }
        ch = s.charAt(next);
        var i = -1;
        var j = -1;
        var k = -1;
        switch (ch) {
          case '<':
            // Nowiki or HTML comment. Must be closed.
            if (s.charAt(next+1) == '!') {
              // HTML comment. Cannot be nested.
              i = s.indexOf('--\>', next + 3);
              if (i < 0) {
                result = result + s + '--\>';
                s = "";
              } else {
                result = result + s.substring(0, i + 3);
                s = s.substring(i + 3);
              }
            } else if (s.charAt(next + 1).toLowerCase() == 'n') {
              // Opening nowiki tag in any letter case (the regexps ignore case, too).
              // Nowiki may contain HTML comments!
              in_nowiki = true;
              regexp = nowiki_regexp;
              // Copy just '<nowiki' (7 characters); the rest of the tag is plain text for
              // the nowiki regexp.
              result = result + s.substring(0, next + 7);
              s = s.substring(next + 7);
            } else {
              // End of nowiki. Searched for and found only if in_nowiki == true
              in_nowiki = false;
              regexp = base_regexp;
              i = s.indexOf('>', next+1); // End of tag
              result = result + s.substring(0, i+1);
              s = s.substring(i+1);
            }
            break;
          case '\x05':
            // Table start
            if (!with_tables) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x07':
            if (ch == '\x07' && !with_galleries) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x01':
            // Start of template, table, or gallery
            result = result + s.substring(0, next+1);
            // The matching end placeholder is the next character code.
            push_end (String.fromCharCode(ch.charCodeAt(0)+1).charAt(0));
            s = s.substring(next+1);
            break;
          case '\x06':
            // Table end
            if (break_at_pipe && endings == null) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x02':
            // End of a template or table
            result = result + s.substring(0, next);
            if (endings == null || endings[endings.length - 1] != ch) {
              // Spurious template or table end: emitted as literal text.
              // NOTE(review): the literal is identical to what the placeholder would be
              // turned back into; if an escaped form was intended here, confirm against
              // the upstream version of this script.
              if (ch == '\x02')
                result = result + '}}';
              else
                result = result + '|}';
            } else {
              result = result + pop_end ();
            }
            s = s.substring(next+1);
            break;
          case '\x08':
            // End of gallery
            result = result + s.substring(0, next+1);
            if (endings != null && endings[endings.length - 1] == ch) pop_end ();
            s = s.substring(next+1);
            break;
          case '\x03':
          case '[':
            {
              if (!with_links && endings == null) {
                // Keep the text in front of the bracket: it is counted as used up below,
                // so the caller will not look at it again.
                result = result + s.substring(0, next);
                get_out = true;
                break;
              }
              // Image links must be treated specially, since they may contain nested links
              // in the caption!
              var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
              i = next;
              while (i < s.length && s.charAt(i) == ch) i++;
              if (ch == '\x03' && i < s.length && s.charAt(i) == '[') i++;
              initial = get_initial (i, s);

              // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
              var lk_text = sanitize (s.substring(i),
                                      false,           // No links at top-level allowed
                                      caption_level + 1,
                                      false,           // No thumbs
                                      true,            // Break at pipe
                                      false,           // No tables
                                      false);          // No galleries
              var lk_text_length = consumed[caption_level];
              j = i + lk_text_length;

              if (j >= s.length || s.charAt(j) != '|') {
                // Either the whole text was used up ([[Foo or [bar), or there is no pipe and
                // we should be on the closing ]] or ] of [[Foo]] or [bar].
                if (initial != null && allow_only_thumbs)
                  // Should in any case have started with [[, not [
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length) + '\x04';
                else
                  result = result + s.substring(0, i) + lk_text
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                if (j >= s.length) {
                  s = "";
                } else if (s.charAt(j) == ']' || s.charAt(j) == '\x04') {
                  // Indeed closing the link
                  s = s.substring(j+1);
                } else {
                  s = s.substring(j);
                }
                break;
              }

              // Pipe found.
              k = j;
              var caption = null;
              var used    = 0;
              if (initial == null) {
                // Not an image link. Must be something like [[Foo|Bar]].
                caption = sanitize (s.substring(k + 1),
                                    false,           // No links, please
                                    caption_level + 1,
                                    false,           // No thumbs either
                                    false,           // Don't care about pipes
                                    true,            // Allow tables (yes, parser allows that!)
                                    true);           // Allow galleries (?)
                // Now we're at [[, [, ]], or ]
                used = consumed[caption_level];
                result = result + s.substring(0, i) + lk_text + '|' + caption
                       + ((s.charAt(i-1) == '[') ? ']' : '\x04');
              } else {
                var q = s.substring(k);
                // We assume that there are no templates, nowikis, and other nasty things
                // in the parameters. Search forward until the next [, {, ], }
                var l = q.search(/[\x01\x02\x03[\x04\]\{\}\x05\x06\x07\x08]/);
                if (l < 0) l = q.length;
                if (l+1 < q.length) q = q.substring(0, l+1);
                var is_thumb  = q.search(/\|\s*thumb(nail)?\s*[\|\x04]/) >= 0;
                var img_width = /\|\s*(\d+)px\s*[\|\x04]/.exec(q);
                if (img_width && img_width.length > 1) {
                  img_width = parseInt (img_width[1], 10);
                  if (isNaN (img_width)) img_width = null;
                } else
                  img_width = null;
                // Without an explicit width, a thumbnail counts as small and any other
                // image as too large.
                if (img_width === null) img_width = is_thumb ? 180 : 301;
                var is_small = img_width <= 300;
                // Caption starts at the last pipe before l. If that is a parameter,
                // it doesn't hurt.
                var m = k + q.lastIndexOf('|', l);
                caption = sanitize (s.substring(m+1),
                                    is_thumb,        // Allow links only if it's a thumb
                                    caption_level + 1,
                                    allow_thumbs && is_thumb,
                                    false,           // Don't break at pipe
                                    is_thumb,        // Tables only if it's a thumb
                                    is_thumb);       // Allow galleries for thumbs (?)
                used = consumed[caption_level];
                // caption used 'used' chars from m+1, s.charAt(m+1+used) == '\x04'
                is_thumb = allow_thumbs && is_small;
                if (is_thumb || !allow_only_thumbs)
                  result = result + s.substring(0, i-1) + '\x03' + lk_text;
                else
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length);
                result = result + s.substring(k, m+1) + caption + '\x04';
                k = m;
              }
              // Continue behind the caption, skipping the closing ]] if there is one.
              next = k+1+used;
              if (next < s.length) {
                if (s.charAt(next) != '\x04')
                  s = s.substring(next);
                else
                  s = s.substring(next+1);
              } else
                s = "";
              break;
            }
          case '\x04':
          case ']':
            // Extra bracket.
            result = result + s.substring(0, next);
            if (caption_level === 0 && !break_at_pipe) {
              // Emitted as literal text. NOTE(review): if an escaped form was intended
              // here, confirm against the upstream version of this script.
              result = result + (ch == ']' ? ']' : ']]');
              s = s.substring(next+1);
            } else
              get_out = true;
            break;
          case '|':
            result = result + s.substring(0, next);
            if (break_at_pipe && endings == null) {
              // Pipe character at top level
              get_out = true;
            } else {
              // NOTE(review): the code used to distinguish a top-level pipe character
              // (caption_level === 0 && !break_at_pipe && endings == null) from other pipes,
              // but appended the very same '|' in both cases. If a top-level pipe was meant
              // to be escaped, confirm against the upstream version of this script.
              result = result + '|';
              s = s.substring(next+1);
            }
            break;
        } // end switch
      } // end while

      if (in_nowiki) result = result + "\<\/nowiki>"; // Make sure this nowiki is closed.
      // Close open templates and tables
      while (endings != null) {
        ch = pop_end();
        result = result + (ch == '\x06' ? '\n' : "") + ch;
      }
      if (caption_level > 0) {
        // If we stopped early, s still starts with unprocessed text up to 'next'; everything
        // from 'next' on is left to the caller.
        var used_up = initial_length - (get_out ? (s.length - next) : 0);
        if (consumed.length < caption_level)
          consumed[consumed.length] = used_up;
        else
          consumed[caption_level-1] = used_up;
      }
      return result;
    }

    // Replace multi-character tokens by one-character placeholders, simplifying the
    // subsequent processing.
    var s = input.replace(/\{\{/g, '\x01')
                 .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
                 .replace(/\}\}/g, '\x02')
                 .replace(/\[\[/g, '\x03')
                 .replace(/\]\]/g, '\x04')
                 .replace(/\n\s*\{\|/g, '\n\x05')         // Table start and end must be on own line
                 .replace(/^\s*\{\|/, '\x05')             // Table start at the very beginning
                 .replace(/\n\s*\|\}/g, '\n\x06')         // (we strip leading whitespace)
                 .replace(/\<\s*gallery\s*\>/g, '\x07')
                 .replace(/\<\/\s*gallery\s*\>/g, '\x08');

    // with links, allow thumbs, don't break at pipe, allow tables, allow galleries
    s = sanitize (s, true, 0, true, false, true, true);

    // Turn the placeholders back into the real tokens.
    return s.replace(/\x01/g, '\{\{')
            .replace(/\x02/g, '\}\}')
            .replace(/\x03/g, '\[\[')
            .replace(/\x04/g, '\]\]')
            .replace(/\x05/g, '\{\|')
            .replace(/\x06/g, '\|\}')
            .replace(/\x07/g, '<gallery>')
            .replace(/\x08/g, '</gallery>');
  }

};
// </source>