<?php
$urls = getUrls(); // some function getting say 10 or more external links
foreach($urls as $k=>$url){
// this could potentially take 0-30 seconds each
// (more or less depending on connection, target site, timeout settings...)
if( ! isValidUrl($url) ){
unset($urls[$k]);
}
}
echo "yay all done! now show my site";
foreach($urls as $url){
echo "<a href=\"{$url}\">{$url}</a><br/>";
}
下面的函数可能会有所帮助,您可能需要修改它们以满足您的需要:
function isValidUrl($url){
// first do some quick sanity checks:
if(!$url || !is_string($url)){
return false;
}
// quick check url is roughly a valid http request: ( http://blah/... )
if( ! preg_match('/^http(s)?:\/\/[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(\/.*)?$/i', $url) ){
return false;
}
// the next bit could be slow:
if(getHttpResponseCode_using_curl($url) != 200){
// if(getHttpResponseCode_using_getheaders($url) != 200){ // use this one if you cant use curl
return false;
}
// all good!
return true;
}
function getHttpResponseCode_using_curl($url, $followredirects = true){
// returns int responsecode, or false (if url does not exist or connection timeout occurs)
// NOTE: could potentially take up to 0-30 seconds , blocking further code execution (more or less depending on connection, target site, and local timeout settings))
// if $followredirects == false: return the FIRST known httpcode (ignore redirects)
// if $followredirects == true : return the LAST known httpcode (when redirected)
if(! $url || ! is_string($url)){
return false;
}
$ch = @curl_init($url);
if($ch === false){
return false;
}
@curl_setopt($ch, CURLOPT_HEADER ,true); // we want headers
@curl_setopt($ch, CURLOPT_NOBODY ,true); // dont need body
@curl_setopt($ch, CURLOPT_RETURNTRANSFER ,true); // catch output (do NOT print!)
if($followredirects){
@curl_setopt($ch, CURLOPT_FOLLOWLOCATION ,true);
@curl_setopt($ch, CURLOPT_MAXREDIRS ,10); // fairly random number, but could prevent unwanted endless redirects with followlocation=true
}else{
@curl_setopt($ch, CURLOPT_FOLLOWLOCATION ,false);
}
// @curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,5); // fairly random number (seconds)... but could prevent waiting forever to get a result
// @curl_setopt($ch, CURLOPT_TIMEOUT ,6); // fairly random number (seconds)... but could prevent waiting forever to get a result
// @curl_setopt($ch, CURLOPT_USERAGENT ,"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"); // pretend we're a regular browser
@curl_exec($ch);
if(@curl_errno($ch)){ // should be 0
@curl_close($ch);
return false;
}
$code = @curl_getinfo($ch, CURLINFO_HTTP_CODE); // note: php.net documentation shows this returns a string, but really it returns an int
@curl_close($ch);
return $code;
}
function getHttpResponseCode_using_getheaders($url, $followredirects = true){
// returns string responsecode, or false if no responsecode found in headers (or url does not exist)
// NOTE: could potentially take up to 0-30 seconds , blocking further code execution (more or less depending on connection, target site, and local timeout settings))
// if $followredirects == false: return the FIRST known httpcode (ignore redirects)
// if $followredirects == true : return the LAST known httpcode (when redirected)
if(! $url || ! is_string($url)){
return false;
}
$headers = @get_headers($url);
if($headers && is_array($headers)){
if($followredirects){
// we want the last errorcode, reverse array so we start at the end:
$headers = array_reverse($headers);
}
foreach($headers as $hline){
// search for things like "HTTP/1.1 200 OK" , "HTTP/1.0 200 OK" , "HTTP/1.1 301 PERMANENTLY MOVED" , "HTTP/1.1 400 Not Found" , etc.
// note that the exact syntax/version/output differs, so there is some string magic involved here
if(preg_match('/^HTTP\/\S+\s+([1-9][0-9][0-9])\s+.*/', $hline, $matches) ){// "HTTP/*** ### ***"
$code = $matches[1];
return $code;
}
}
// no HTTP/xxx found in headers:
return false;
}
// no headers :
return false;
}
在确定 php 中的 url 是否存在时,需要注意以下几点:
请记住,无论您使用什么方法,等待响应都需要时间。
所有代码可能(并且可能会)停止,直到您知道结果或请求超时。
例如:如果网址无效或无法访问,下面的代码可能需要很长时间才能显示页面:
<?php $urls = getUrls(); // some function getting say 10 or more external links foreach($urls as $k=>$url){ // this could potentially take 0-30 seconds each // (more or less depending on connection, target site, timeout settings...) if( ! isValidUrl($url) ){ unset($urls[$k]); } } echo "yay all done! now show my site"; foreach($urls as $url){ echo "<a href=\"{$url}\">{$url}</a><br/>"; }下面的函数可能会有所帮助,您可能需要修改它们以满足您的需要:
function isValidUrl($url){ // first do some quick sanity checks: if(!$url || !is_string($url)){ return false; } // quick check url is roughly a valid http request: ( http://blah/... ) if( ! preg_match('/^http(s)?:\/\/[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(\/.*)?$/i', $url) ){ return false; } // the next bit could be slow: if(getHttpResponseCode_using_curl($url) != 200){ // if(getHttpResponseCode_using_getheaders($url) != 200){ // use this one if you cant use curl return false; } // all good! return true; } function getHttpResponseCode_using_curl($url, $followredirects = true){ // returns int responsecode, or false (if url does not exist or connection timeout occurs) // NOTE: could potentially take up to 0-30 seconds , blocking further code execution (more or less depending on connection, target site, and local timeout settings)) // if $followredirects == false: return the FIRST known httpcode (ignore redirects) // if $followredirects == true : return the LAST known httpcode (when redirected) if(! $url || ! is_string($url)){ return false; } $ch = @curl_init($url); if($ch === false){ return false; } @curl_setopt($ch, CURLOPT_HEADER ,true); // we want headers @curl_setopt($ch, CURLOPT_NOBODY ,true); // dont need body @curl_setopt($ch, CURLOPT_RETURNTRANSFER ,true); // catch output (do NOT print!) if($followredirects){ @curl_setopt($ch, CURLOPT_FOLLOWLOCATION ,true); @curl_setopt($ch, CURLOPT_MAXREDIRS ,10); // fairly random number, but could prevent unwanted endless redirects with followlocation=true }else{ @curl_setopt($ch, CURLOPT_FOLLOWLOCATION ,false); } // @curl_setopt($ch, CURLOPT_CONNECTTIMEOUT ,5); // fairly random number (seconds)... but could prevent waiting forever to get a result // @curl_setopt($ch, CURLOPT_TIMEOUT ,6); // fairly random number (seconds)... but could prevent waiting forever to get a result // @curl_setopt($ch, CURLOPT_USERAGENT ,"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"); // pretend we're a regular browser @curl_exec($ch); if(@curl_errno($ch)){ // should be 0 @curl_close($ch); return false; } $code = @curl_getinfo($ch, CURLINFO_HTTP_CODE); // note: php.net documentation shows this returns a string, but really it returns an int @curl_close($ch); return $code; } function getHttpResponseCode_using_getheaders($url, $followredirects = true){ // returns string responsecode, or false if no responsecode found in headers (or url does not exist) // NOTE: could potentially take up to 0-30 seconds , blocking further code execution (more or less depending on connection, target site, and local timeout settings)) // if $followredirects == false: return the FIRST known httpcode (ignore redirects) // if $followredirects == true : return the LAST known httpcode (when redirected) if(! $url || ! is_string($url)){ return false; } $headers = @get_headers($url); if($headers && is_array($headers)){ if($followredirects){ // we want the last errorcode, reverse array so we start at the end: $headers = array_reverse($headers); } foreach($headers as $hline){ // search for things like "HTTP/1.1 200 OK" , "HTTP/1.0 200 OK" , "HTTP/1.1 301 PERMANENTLY MOVED" , "HTTP/1.1 400 Not Found" , etc. // note that the exact syntax/version/output differs, so there is some string magic involved here if(preg_match('/^HTTP\/\S+\s+([1-9][0-9][0-9])\s+.*/', $hline, $matches) ){// "HTTP/*** ### ***" $code = $matches[1]; return $code; } } // no HTTP/xxx found in headers: return false; } // no headers : return false; }这里:
$file = 'http://www.example.com/somefile.jpg'; $file_headers = @get_headers($file); if(!$file_headers || $file_headers[0] == 'HTTP/1.1 404 Not Found') { $exists = false; } else { $exists = true; }来自此处和在上面的帖子下面,有一个 curl 解决方案:
function url_exists($url) { return curl_init($url) !== false; }