Title: 
  [ANeff] ER for: canonicalizeURL(inputString, restrictMultiple, restrictMixed[, throwOnError=false])
| View in TrackerStatus/Resolution/Reason: To Fix//
Reporter/Name(from Bugbase): Aaron Neff / Aaron Neff (Aaron Neff)
Created: 01/21/2015
Components: Language
Versions: 11.0
Failure Type: Enhancement Request
Found In Build/Fixed In Build: CF11_Final /
Priority/Frequency: Trivial / Unknown
Locale/System: English / Win All
Vote Count: 3
canonicalize(myURL) is broken b/c it incorrectly interprets some query string parameters as character entities and converts them to symbols.
Example:
writeOutput(canonicalize("http://www.domain.com/?foo=bar&pid=product_id", true, true, false));
returns: http://www.domain.com/?foo=barπd=product_id  (note the ampersand is gone and there's a Pi symbol between 'r' and 'd')
Thus, URLs are a special case and a URL-specific canonicalizeURL() function is needed that takes same parameters as canonicalize(). Example:
<cfscript>
  // Canonicalizes a URL part by part, b/c canonicalize() on the whole URL converts, for example, &pi to the Pi symbol in the query string ?foo=bar&pid=product_id
  // The URL is split into schemeHostPath, query string and fragment. Each query-string name and value is
  // canonicalized on its own, so the "&", ";" and "=" separators are never passed to canonicalize().
  //   inputString      - the URL to canonicalize (leading/trailing whitespace is trimmed first)
  //   restrictMultiple - passed unchanged to every canonicalize() call
  //   restrictMixed    - passed unchanged to every canonicalize() call
  //   throwOnError     - passed unchanged to every canonicalize() call; also controls the invalid-URL case below
  // Returns the canonicalized URL, or "" when the input fails isValid("url") and throwOnError is false.
  // Throws "URL is not valid" when the input fails isValid("url") and throwOnError is true.
  function udfCanonicalizeURL(required string inputString, required boolean restrictMultiple, required boolean restrictMixed, boolean throwOnError=false) {
	  var canonicalizedURL = "";
	  var trimmedURL = trim(ARGUMENTS.inputString);
	  if(!isValid("url", trimmedURL)) {//note: has a bug per #3924581
		  if(ARGUMENTS.throwOnError) {
			  throw(message = "URL is not valid");
		  }
		  return canonicalizedURL;
	  }
	  var pattern = "([^?##]*)?(\?([^##]*))?(##(.*))?";//parses the URL into schemeHostPath, querystring and fragment
	  // Index 1 of pos/len is the whole match, so capture group N is found at index N+1
	  var parsedURL = reFind(pattern, trimmedURL, 1, true);
	  if(!parsedURL.len[2]) {//2=schemeHostPath
		  return canonicalizedURL;
	  }
	  canonicalizedURL &= canonicalize(mid(trimmedURL, parsedURL.pos[2], parsedURL.len[2]), ARGUMENTS.restrictMultiple, ARGUMENTS.restrictMixed, ARGUMENTS.throwOnError);
	  if(parsedURL.len[4]) {//4=querystring
		  var qs = mid(trimmedURL, parsedURL.pos[4], parsedURL.len[4]);
		  var canonicalizedQS = "";
		  // Each match is one name[=value] pair, prefixed by its "&" or ";" delimiter when it has one
		  var qsPairs = reMatch("[\&;]?[^\&;]+", qs);
		  for(var qsPair in qsPairs) {
			  var qsDelim = reFind("^[\&;]", qsPair) ? left(qsPair, 1) : "";
			  // The regex guarantees at least one character after a leading delimiter
			  var qsPairNoDelim = len(qsDelim) ? right(qsPair, len(qsPair) - 1) : qsPair;
			  // Split on the FIRST "=" by position; list functions skip empty elements and would turn "=v" into name "v"
			  var qsValueStartPos = find("=", qsPairNoDelim);
			  var qsName = qsPairNoDelim;
			  var qsValue = "";
			  if(qsValueStartPos) {
				  qsName = (qsValueStartPos gt 1) ? left(qsPairNoDelim, qsValueStartPos - 1) : "";
				  qsValue = (len(qsPairNoDelim) gt qsValueStartPos) ? right(qsPairNoDelim, len(qsPairNoDelim) - qsValueStartPos) : "";
			  }
			  canonicalizedQS &= qsDelim;
			  if(len(qsName)) {
				  canonicalizedQS &= canonicalize(qsName, ARGUMENTS.restrictMultiple, ARGUMENTS.restrictMixed, ARGUMENTS.throwOnError);
			  }
			  if(qsValueStartPos) {
				  // Always keep the "=" so that "foo=" (empty value) is not rewritten to "foo"
				  canonicalizedQS &= "=";
				  if(len(qsValue)) {
					  canonicalizedQS &= canonicalize(qsValue, ARGUMENTS.restrictMultiple, ARGUMENTS.restrictMixed, ARGUMENTS.throwOnError);
				  }
			  }
		  }
		  if(len(canonicalizedQS)) {
			  canonicalizedURL &= ('?' & canonicalizedQS);
		  }
	  }
	  if(parsedURL.len[6]) {//6=fragment
		  canonicalizedURL &= ("##" & canonicalize(mid(trimmedURL, parsedURL.pos[6], parsedURL.len[6]), ARGUMENTS.restrictMultiple, ARGUMENTS.restrictMixed, ARGUMENTS.throwOnError));
	  }
	  return canonicalizedURL;
  }
  // Demo: output the built-in canonicalize() result and the UDF result for the same URL, separated by a line break
  theURL = "http://www.domain.com/?foo=bar&pid=product_id";
  writeOutput("#canonicalize(theURL, true, true, false)#<br> #udfCanonicalizeURL(theURL, true, true, false)#");
</cfscript>
----------------------------- Additional Watson Details -----------------------------
Watson Bug ID:	3924625
Reason:	BugVerified
External Customer Info:
External Company:  
External Customer Name: Aaron
External Customer Email:
  Attachments:
- January 21, 2015 00:00:00: 1_3924625.cfm
 
Comments: