Format String As Uri Component

Audience Level

Beginner and above.

Introduction

A JScript / JavaScript / ECMAScript function that takes a string and returns a human-readable version of it that is safe for use in URIs. It is particularly useful for creating HTML filenames from the document <title> text. It converts international/accented characters into ASCII equivalents so that URIs remain readable, rather than stripping out the original characters completely. Of course, if you don't need to worry about that then just use the ordinary “Server.URLEncode()” ASP method (or equivalent) instead. Some examples of inputs and their corresponding outputs are shown below:

  • Monty Python: The Movies (Box Set) → monty_python_the_movies_box_set
  • Apollo 13 [1995] → apollo_13_1995
  • Through Gates of Fire: A Journey into World Disorder → through_gates_of_fire_a_journey_into (capped at 40 chars)
  • <b>Shawn Colvin</b> “<i>Whole New You</i>” → shawn_colvin_whole_new_you
  • Random chars: äöüÄÖÜàùòìèéçñ → random_chars_aouaouauoieecn

This is a computationally expensive function to invoke and is best used once for a given string and the result stored in a database or other data store for future use.

Source Code

/*
Function: formatStringAsUriComponent()
Description: Returns a lower-case, underscore-separated, ASCII-only version of a string
             that is safe for use in URIs - characters invalid in URIs have been
             stripped. Also attempts to make the URI more readable by replacing HTML
             tags with whitespace, spelling out certain symbols (&, @, %, etc.) and
             transliterating accented Latin letters to plain ASCII equivalents.
Parameters:
    strText     Required. The text to convert (coerced to a String). A missing or
                empty value is an error.
    intMaxLen   Optional. Maximum length of the result. Text longer than this is cut
                back to the last whole word that fits; if the very first word is
                already too long it is cut hard at intMaxLen. Omitted, zero or
                negative means "no limit".
Returns: String
Throws: Error - carries the text in both "message" (standard) and "description"
        (JScript), plus a JScript-style "number" property.
History:
20050510 1148BST    v2      Andrew Urquhart     Created
*/

function formatStringAsUriComponent(strText, intMaxLen) {
    // Builds an Error whose text is readable both in standard engines (.message)
    // and in JScript (.number / .description). The two-argument JScript form
    // "new Error(number, description)" loses the text everywhere else.
    function createError(intNumber, strMessage) {
        var e = new Error(strMessage);
        e.number = intNumber;
        e.description = strMessage;
        return e;
    }

    try {
        if (!strText) {
            throw createError(1, "Required parameter \"strText\" was not defined");
        }

        var s = String(strText);

        // REPLACE HTML WITH WHITESPACE
        s = s.replace(/<\/?[^>]*>/g, " ");

        // MAKE CERTAIN SYMBOL-TO-WORD REPLACEMENTS
        s = s.replace(/&amp;\s+|&\s+/g, "and ");
        s = s.replace(/\s*@\s*/g, " at ");
        s = s.replace(/\s*%\s*/g, " percent ");
        s = s.replace(/\s*£\s*/g, " pound ");
        s = s.replace(/\s*\$\s*/g, " dollar ");
        s = s.replace(/\s*>\s*/g, " greater than ");
        s = s.replace(/\s*<\s*/g, " less than ");
        s = s.replace(/\s*=\s*/g, " equals ");
        s = s.replace(/\s*\+\s*/g, " plus ");

        // REMOVE ENTITIES (named, e.g. &nbsp;, and numeric, e.g. &#8220; or &#x201c;)
        s = s.replace(/&#?\w+;/g, "");

        // NORMALISE WHITESPACE FIRST so that tabs and line breaks survive as word
        // separators instead of being deleted by the control-character range below
        s = s.replace(/\s+/g, " ");

        // REMOVE 'PUNCTUATION'
        s = s.replace(/\./g, " ");
        s = s.replace(/[\u0000-\u0019\u0021-\u002F\u003a-\u003f\u005b-\u0060\u007b-\u007f\u00a1-\u00bf\u02c6-\u0385\u2018\u2019]/g, "");

        // SUBSTITUTE CERTAIN UNICODE CHARS FOR ASCII ALTERNATIVES
        // Each entry is [characters to match, ASCII replacement]. The character
        // classes do not overlap, so the order of entries does not matter.
        var arrSubstitutions = [
            [/[\u00c6\u00e6\u01fc\u01fd]/g, "ae"],
            [/[\u00c0-\u00c5\u00e0-\u00e5\u0100-\u0105\u01fa\u01fb]/g, "a"],
            [/[\u00c8-\u00cb\u00e8-\u00eb\u0112-\u011b\u018f\u0259]/g, "e"],
            [/[\u00cc-\u00cf\u00ec-\u00ef\u0128-\u0131]/g, "i"],
            [/[\u00d2-\u00d6\u00d8\u00f2-\u00f6\u00f8\u014c-\u0151\u01a0-\u01a1\u01d1-\u01d2\u01fe-\u01ff]/g, "o"],
            [/[\u00d9-\u00dc\u00f9-\u00fc\u0168-\u0173\u01d3-\u01dc]/g, "u"],
            [/[\u00c7\u00e7\u0106-\u010d]/g, "c"],
            [/[\u00d0\u00f0\u010e-\u0111]/g, "d"],
            [/[\u011c-\u0123]/g, "g"],
            [/[\u0124-\u0127]/g, "h"],
            [/[\u0132-\u0133]/g, "ij"],
            [/[\u0134-\u0135]/g, "j"],
            [/[\u0136-\u0138]/g, "k"],
            [/[\u0139-\u0142]/g, "l"],
            [/[\u0143-\u014b\u00f1\u00d1]/g, "n"],
            [/[\u0152-\u0153]/g, "oe"],
            [/[\u0154-\u0159]/g, "r"],
            [/[\u015a-\u0161\u00df]/g, "s"],
            [/[\u0162-\u0167]/g, "t"],
            [/[\u0174-\u0175]/g, "w"],
            [/[\u0176-\u0178\u00fd\u00ff\u00dd]/g, "y"],
            [/[\u0179-\u017e]/g, "z"]
        ];
        for (var i = 0; i < arrSubstitutions.length; i++) {
            s = s.replace(arrSubstitutions[i][0], arrSubstitutions[i][1]);
        }

        // REPLACE ANYTHING NOT IN CHARACTER SET WITH WHITESPACE
        s = s.replace(/[^\w-]+/g, " ");

        // REPLACE MULTIPLE WHITESPACE WITH SINGLE WHITESPACE, THEN TRIM, so that
        // leading whitespace does not use up part of the intMaxLen budget
        s = s.replace(/\s{2,}/g, " ");
        s = s.replace(/^\s+|\s+$/g, "");

        // TRUNCATE TITLE AT WORD-BOUNDARY AT OR JUST BEFORE intMaxLen CHARS
        if (intMaxLen > 0 && s.length > intMaxLen) {
            var strHead = s.substring(0, intMaxLen);
            // If the first dropped character is a space the cut landed exactly on
            // the end of a word and nothing more needs removing.
            if (s.charAt(intMaxLen) != " ") {
                var intLastSpace = strHead.lastIndexOf(" ");
                // Drop the partial last word; with no space at all the single
                // over-long word is kept hard-truncated rather than returning "".
                if (intLastSpace > 0) {
                    strHead = strHead.substring(0, intLastSpace);
                }
            }
            s = strHead;
        }

        // TRIM WHITESPACE LEFT AT THE END BY TRUNCATION
        s = s.replace(/^\s+|\s+$/g, "");

        // REPLACE WHITESPACE WITH _
        s = s.replace(/\s/g, "_");

        return s.toLowerCase();
    }
    catch (err) {
        // err may be one of our own errors or a native one (e.g. TypeError), so
        // fall back to the standard .message when .description is absent.
        throw createError(err.number, "Function formatStringAsUriComponent() failed with parameters strText=\"" + strText + "\", intMaxLen=\"" + intMaxLen + "\". Message=\r\n" + (err.description || err.message));
    }
}

Download

Download the source directly.

Bugs

UTF-8 Unicode is the intended character encoding for this function, i.e. under ASP the “CodePage” pre-processing parameter is intended to be “65001”. Not all accented characters will be translated into plain ASCII equivalents, and some may be incorrectly converted due to oversights on my part. If you find an incorrect conversion please add a comment saying which character is being incorrectly converted, what it is currently being converted to and what you think it should be instead.

Advertisement

Feedback

Voting Panel
Is this useful?
or
Did you find any bugs?
or
Did it solve your programming problem?
or
Rate this script: (0=poor, 5=very good)
Answers are anonymous, only the combined totals are stored. Uses cookies.