Format String As Uri Component
Audience Level
Beginner and above.
Introduction
A JScript / Javascript / ECMAScript function that takes a string and
returns a human-readable version of it that is safe for use in URIs. Particularly useful in creating HTML
filenames from the document <title> text. It converts international/accented characters
into ASCII equivalents so that URIs remain readable, rather than stripping out the original characters
completely. Of course, if you don't need to worry about that then just use the ordinary “Server.URLEncode()”
ASP method (or equivalent) instead. Some examples of inputs and their corresponding outputs are shown below:
- Monty Python: The Movies (Box Set) → monty_python_the_movies_box_set
- Apollo 13 [1995] → apollo_13_1995
- Through Gates of Fire: A Journey into World Disorder → through_gates_of_fire_a_journey_into (capped at 40 chars)
- <b>Shawn Colvin</b> “<i>Whole New You</i>” → shawn_colvin_whole_new_you
- Random chars: äöüÄÖÜàùòìèéçñ → random_chars_aouaouauoieecn
This is a computationally expensive function to invoke and is best used once for a given string and the result stored in a database or other data store for future use.
Source Code
/*
Function: formatStringAsUriComponent()
Description: Returns a string that is safe for use in URIs - characters invalid in URIs
have been stripped. Also attempts to make the URI more readable.
Returns: String
History:
20050510 1148BST v2 Andrew Urquhart Created
*/
/**
 * Converts arbitrary text (for example an HTML document title) into a
 * lower-case, underscore-separated ASCII string suitable for use as a URI
 * component.
 *
 * Processing order matters and is: strip HTML tags, normalise whitespace,
 * spell out selected symbols ("&" -> "and", "%" -> "percent", ...), drop
 * entities and punctuation, transliterate accented Latin letters to ASCII,
 * turn anything still outside [A-Za-z0-9_-] into a space, trim, optionally
 * truncate on a word boundary, then join words with "_" and lower-case.
 *
 * Written to stay ES3/JScript compatible (var, no newer String methods).
 *
 * @param {string} strText     Text to convert. Required: a missing or empty
 *                             value causes an Error to be thrown.
 * @param {number} [intMaxLen] Optional maximum length of the result. The text
 *                             is cut at the last word boundary at or before
 *                             this length; a single word longer than the
 *                             limit is cut hard at the limit.
 * @returns {string} The URI-safe, lower-case string.
 * @throws {Error} Any failure is re-thrown as an Error whose message names
 *                 this function, echoes both parameters and appends the
 *                 original message. The JScript "number" and "description"
 *                 properties are populated as well.
 */
function formatStringAsUriComponent(strText, intMaxLen) {
    // Builds an Error that exposes its text through the standard "message"
    // property as well as the JScript-specific "number" / "description"
    // properties. The two-argument JScript form new Error(number, description)
    // does not set the message on standard ECMAScript engines.
    function makeError(intNumber, strMessage) {
        var objError = new Error(strMessage);
        objError.number = intNumber;
        objError.description = strMessage;
        return objError;
    }

    try {
        if (!strText) {
            throw makeError(1, "Required parameter \"strText\" was not defined");
        }
        var s = strText;

        // REPLACE HTML WITH WHITESPACE
        s = s.replace(/<[^>]*>/g, " ");

        // NORMALISE WHITESPACE
        // Tabs and line breaks fall inside the control-character range that is
        // deleted below; turning them into plain spaces first keeps the words
        // on either side separate.
        s = s.replace(/\s+/g, " ");

        // MAKE CERTAIN SYMBOL-TO-WORD REPLACEMENTS
        // A free-standing ampersand, raw or HTML-encoded as "&amp;", becomes "and".
        s = s.replace(/&(?:amp;)?\s+/g, "and ");
        s = s.replace(/\s*@\s*/g, " at ");
        s = s.replace(/\s*%\s*/g, " percent ");
        s = s.replace(/\s*\u00a3\s*/g, " pound "); // pound sign, escaped so the source is encoding-independent
        s = s.replace(/\s*\$\s*/g, " dollar ");
        s = s.replace(/\s*>\s*/g, " greater than ");
        s = s.replace(/\s*<\s*/g, " less than ");
        s = s.replace(/\s*=\s*/g, " equals ");
        s = s.replace(/\s*\+\s*/g, " plus ");

        // REMOVE ENTITIES (named such as &nbsp; and numeric such as &#38;)
        s = s.replace(/&#?\w+;/g, "");

        // REMOVE 'PUNCTUATION'
        // A full stop separates words; other punctuation (including straight
        // and curly apostrophes, hyphen and underscore) is deleted outright.
        s = s.replace(/\./g, " ");
        s = s.replace(/[\u0000-\u0019\u0021-\u002f\u003a-\u003f\u005b-\u0060\u007b-\u007f\u00a1-\u00bf\u02c6-\u0385\u2018\u2019]/g, "");

        // SUBSTITUTE CERTAIN UNICODE CHARS FOR ASCII ALTERNATIVES
        // Each class lists explicit code points or deliberate ranges only. The
        // ligatures are handled first so they cannot be claimed by a
        // single-letter class.
        s = s.replace(/[\u00c6\u00e6\u01fc\u01fd]/g, "ae");
        s = s.replace(/[\u0132\u0133]/g, "ij");
        s = s.replace(/[\u0152\u0153]/g, "oe");
        s = s.replace(/\u00df/g, "ss");
        s = s.replace(/[\u00de\u00fe]/g, "th");
        s = s.replace(/[\u00c0-\u00c5\u00e0-\u00e5\u0100-\u0105\u01fa\u01fb]/g, "a");
        s = s.replace(/[\u00c8-\u00cb\u00e8-\u00eb\u0112-\u011b\u018f\u0259]/g, "e");
        s = s.replace(/[\u00cc-\u00cf\u00ec-\u00ef\u0128-\u0131]/g, "i");
        s = s.replace(/[\u00d2-\u00d6\u00d8\u00f2-\u00f6\u00f8\u014c-\u0151\u01a0\u01a1\u01d1\u01d2\u01fe\u01ff]/g, "o");
        s = s.replace(/[\u00d9-\u00dc\u00f9-\u00fc\u0168-\u0173\u01d3-\u01dc]/g, "u");
        s = s.replace(/[\u00c7\u00e7\u0106-\u010d]/g, "c");
        s = s.replace(/[\u00d0\u00f0\u010e-\u0111]/g, "d");
        s = s.replace(/[\u011c-\u0123]/g, "g");
        s = s.replace(/[\u0124-\u0127]/g, "h");
        s = s.replace(/[\u0134\u0135]/g, "j");
        s = s.replace(/[\u0136-\u0138]/g, "k");
        s = s.replace(/[\u0139-\u0142]/g, "l");
        s = s.replace(/[\u00d1\u00f1\u0143-\u014b]/g, "n");
        s = s.replace(/[\u0154-\u0159]/g, "r");
        s = s.replace(/[\u015a-\u0161]/g, "s");
        s = s.replace(/[\u0162-\u0167]/g, "t");
        s = s.replace(/[\u0174\u0175]/g, "w");
        s = s.replace(/[\u00dd\u00fd\u00ff\u0176-\u0178]/g, "y");
        s = s.replace(/[\u0179-\u017e]/g, "z");

        // REPLACE ANYTHING NOT IN CHARACTER SET WITH WHITESPACE
        // Every run of disallowed characters (spaces included) collapses to a
        // single space, so no separate "squeeze whitespace" pass is needed.
        s = s.replace(/[^\w-]+/g, " ");

        // TRIM WHITESPACE AT START AND END OF TITLE
        // Done before truncation so leading whitespace does not count
        // towards intMaxLen.
        s = s.replace(/^\s+|\s+$/g, "");

        // TRUNCATE TITLE AT WORD-BOUNDARY AT OR JUST BEFORE intMaxLen CHARS
        if (intMaxLen && s.length > intMaxLen) {
            var strCut = s.substring(0, intMaxLen);
            // When the character just past the limit is a space, the cut
            // already sits on a word boundary and the last word is kept whole.
            if (s.charAt(intMaxLen) !== " ") {
                var intLastSpace = strCut.lastIndexOf(" ");
                // Otherwise back up to the previous word boundary. With no
                // boundary at all (one over-long word) keep the hard cut
                // rather than returning an empty string.
                if (intLastSpace > 0) {
                    strCut = strCut.substring(0, intLastSpace);
                }
            }
            s = strCut;
        }

        // REPLACE WHITESPACE WITH _
        s = s.replace(/\s/g, "_");

        return s.toLowerCase();
    }
    catch (err) {
        throw makeError(err.number, "Function formatStringAsUriComponent() failed with parameters strText=\"" + strText + "\", intMaxLen=\"" + intMaxLen + "\". Message=\r\n" + (err.message || err.description));
    }
}
Download
Bugs
UTF-8 Unicode is the intended character encoding for this function, i.e. under ASP the “CodePage”
pre-processing parameter is intended to be “65001”. Not all accented characters will be translated into plain ASCII
equivalents, and some may be incorrectly converted due to oversights on my part. If you find an incorrect
conversion please add a comment saying which character is being incorrectly converted, what it is currently
being converted to and what you think it should be instead.