That's what was so WTFy to me: PHP already has a function that does most of this, parse_url().
<?php
// Break a fully-populated URL into its components and dump the resulting array.
$sampleUrl = 'http://username:password@hostname:8080/path?arg=value#anchor';
$components = parse_url($sampleUrl);
print_r($components);
?>
That's what was so WTFy to me: PHP already has a function that does most of this, parse_url().
<?php
// Break a fully-populated URL into its components and dump the resulting array.
$sampleUrl = 'http://username:password@hostname:8080/path?arg=value#anchor';
$components = parse_url($sampleUrl);
print_r($components);
?>
One of the long-gone employees at my company did it like this. I especially like the $options voodoo and the hardcoded list of top-level domains :)
<?php
/**
 * Checks whether a string is a syntactically valid URL.
 *
 * The URL is matched (case-insensitively) against one regular expression that
 * is assembled from the sections scheme, user info, address, port, path,
 * query and fragment. Nothing is resolved or contacted; this is a pure
 * syntax check.
 *
 * $options is a string of two-character pairs: a section flag followed by a
 * modifier. Modifiers are "+" (section required), "?" (section optional) and
 * "-" (section not allowed). Flags and their defaults:
 *
 *   s?  scheme as a whole        u?  user section
 *   H?  http://                  P?  password inside the user section
 *   S?  https://                 a+  address (host)
 *   E-  mailto:                  I?  IP address as the address
 *   F-  ftp://                   p?  port number
 *   f?  path                     q?  query string
 *   r?  fragment (anchor)
 *
 * If any of H/S/E/F is "+", only the first such scheme (in the order
 * H, S, E, F) is accepted and the other scheme flags are ignored.
 *
 * Changes compared with the historical implementation:
 *  - uses preg_match() instead of ereg()/eregi(), which no longer exist in
 *    PHP 7+;
 *  - the top-level domain is matched by shape (a letter, then letters,
 *    digits or inner hyphens) instead of against a fixed 2004 list, so newer
 *    TLDs are accepted.
 *
 * @param string $urladdr The URL to check.
 * @param string $options Option pairs as described above, e.g. "s+I-".
 *                        A malformed string raises an E_USER_ERROR.
 * @return bool True if $urladdr matches the assembled expression, false
 *              otherwise (including when the regex engine reports an error).
 */
function validateUrlSyntax( $urladdr, $options="" )
{
    // Options must be a sequence of <flag><modifier> pairs and nothing else.
    // "D" makes "$" match only at the very end, as the POSIX functions did.
    if (preg_match('/^([sHSEFuPaIpfqr][+?-])*$/D', $options) !== 1) {
        trigger_error("Options attribute malformed", E_USER_ERROR);
    }

    // Default modifier for every flag that is not given explicitly.
    $defaults = array(
        's' => '?', // Scheme
        'H' => '?', // http://
        'S' => '?', // https:// (SSL)
        'E' => '-', // mailto: (email)
        'F' => '-', // ftp://
        'u' => '?', // User section
        'P' => '?', // Password in user section
        'a' => '+', // Address section
        'I' => '?', // IP address in address section
        'p' => '?', // Port number
        'f' => '?', // File path
        'q' => '?', // Query section
        'r' => '?', // Fragment (anchor)
    );

    // Resolve each flag to the regex quantifier that is appended to its
    // section: "-" becomes "{0}" (never), "+" becomes "" (exactly once) and
    // "?" is already a valid quantifier (optional).
    $aOptions = array();
    foreach ($defaults as $flag => $default) {
        $pos = strpos($options, $flag);
        $modifier = ($pos === false) ? $default : substr($options, $pos + 1, 1);

        if ($modifier === '-') {
            $modifier = '{0}';
        } elseif ($modifier === '+') {
            $modifier = '';
        }

        $aOptions[$flag] = $modifier;
    }

    // Character classes shared by several sections.
    $alphanum   = '[a-zA-Z0-9]';            // Alpha numeric
    $unreserved = "[a-zA-Z0-9_.!~*'()-]";
    $escaped    = '(%[0-9a-fA-F]{2})';      // Hex escape sequence, e.g. %6d
    $reserved   = '[;/?:@&=+$,]';           // Special characters in the URI

    // Scheme - 'http://', 'https://', 'mailto:' or 'ftp://'.
    $schemePrefixes = array(
        'H' => 'http://',
        'S' => 'https://',
        'E' => 'mailto:',
        'F' => 'ftp://',
    );

    // The first scheme marked as required wins outright.
    $requiredPrefix = null;
    foreach ($schemePrefixes as $flag => $prefix) {
        if ($aOptions[$flag] === '') {
            $requiredPrefix = $prefix;
            break;
        }
    }

    if ($requiredPrefix !== null) {
        $scheme = '(' . $requiredPrefix . ')';
    } else {
        // Otherwise every scheme marked optional becomes an alternative.
        $alternatives = array();
        foreach ($schemePrefixes as $flag => $prefix) {
            if ($aOptions[$flag] === '?') {
                $alternatives[] = '(' . $prefix . ')';
            }
        }
        $scheme = '(' . implode('|', $alternatives) . ')';
    }
    $scheme .= $aOptions['s'];

    // User info - 'username@' or 'username:password@'. Contrary to the RFC,
    // ':' is only allowed in the password part, not in the username.
    $userinfo = '((' . $unreserved . '|' . $escaped . '|[;&=+$,])+'
              . '(:(' . $unreserved . '|' . $escaped . '|[;:&=+$,])+)' . $aOptions['P']
              . '@)' . $aOptions['u'];

    // IP address - 0.0.0.0 to 255.255.255.255
    $ipaddress = '((((2(([0-4][0-9])|(5[0-5])))|([01]?[0-9]?[0-9]))\.){3}((2(([0-4][0-9])|(5[0-5])))|([01]?[0-9]?[0-9])))';

    // One domain label followed by a dot: first and last characters must be
    // alphanumeric, hyphens are allowed inside.
    $domain_label = '(' . $alphanum . '(([a-zA-Z0-9-]{0,62})' . $alphanum . ')?\.)';

    // Tertiary domain(s) - optional, any number.
    $domain_tertiary = $domain_label . '*';

    // Second level domain - required.
    $domain_secondary = $domain_label;

    // Top level domain - required. First character must be a letter, last
    // must be alphanumeric, hyphens allowed inside, at most 63 characters.
    // Matching by shape keeps the check valid as new TLDs are introduced.
    $domain_toplevel = '([a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)';

    $domain = $domain_tertiary . $domain_secondary . $domain_toplevel;

    // Address can be an IP address or a domain name.
    if ($aOptions['I'] === '{0}') {
        // IP address not allowed
        $address = '(' . $domain . ')';
    } elseif ($aOptions['I'] === '') {
        // IP address required
        $address = '(' . $ipaddress . ')';
    } else {
        // IP address optional
        $address = '((' . $ipaddress . ')|(' . $domain . '))';
    }
    $address .= $aOptions['a'];

    // Port number - :0 to :65535
    //              (0-59999)            |(60000-64999)     |(65000-65499)      |(65500-65529)   |(65530-65535)
    $port_number = '(:(([0-5]?[0-9]{1,4})|(6[0-4][0-9]{3})|(65[0-4][0-9]{2})|(655[0-2][0-9])|(6553[0-5])))'
                 . $aOptions['p'];

    // Path - as simple as '/' or multiple folders and filenames.
    $path = '(/((;)?(' . $unreserved . '|' . $escaped . '|[:@&=+$,])+(/)?)*)' . $aOptions['f'];

    // Query section - ?var1=value1&var2=value2, ?2393,1221 and much more.
    $querystring = '(\?(' . $reserved . '|' . $unreserved . '|' . $escaped . ')*)' . $aOptions['q'];

    // Fragment section - anchors such as #top.
    $fragment = '(#(' . $reserved . '|' . $unreserved . '|' . $escaped . ')*)' . $aOptions['r'];

    $regexp = '^' . $scheme . $userinfo . $address . $port_number . $path . $querystring . $fragment . '$';

    // A backtick is used as delimiter because it occurs nowhere in the
    // pattern. "i" replaces eregi()'s case-insensitivity; "D" stops "$" from
    // matching before a trailing newline. preg_match() returns false on an
    // engine error (e.g. backtrack limit), which is reported as "not valid".
    return preg_match('`' . $regexp . '`iD', $urladdr) === 1;
}// END Function validateUrlSyntax()
?>