在采集程序或者蜘蛛程序中经常会遇到一类问题,就是将网页中相对路径形式的URL转换为绝对路径形式的URL。例如在http://www.youkud.com/blog/1/这个页面中,有一个URL链接为../index.php,那么我们要将它转换为http://www.youkud.com/blog/index.php。下面给出了解决这类问题的代码。该程序能够成功处理各种URL,将其变成绝对形式。
<?php
/**
 * Resolve a (possibly relative) URL reference against a base URL and return
 * the absolute URL, following the reference-resolution rules of RFC 3986 §5.2.
 *
 * Behaviour:
 *  - A reference that already has a scheme (http:, mailto:, ...) is returned unchanged.
 *  - A protocol-relative reference ("//host/path") only inherits the base scheme.
 *  - An absolute-path reference ("/x") replaces the base path.
 *  - A relative-path reference is merged with the base path up to its last "/",
 *    so a base of ".../blog/1/" keeps "1/" as the current directory.
 *  - "." and ".." segments are removed; ".." never climbs above the root.
 *  - The query and fragment of the reference are kept; an empty-path reference
 *    ("?a=1", "#top", "") keeps the base path (and the base query when the
 *    reference has none).
 *  - The base port and userinfo are preserved in the result.
 *  - Backslashes in paths are treated as "/" and empty path segments ("//")
 *    are collapsed.
 *
 * @param string $srcurl  URL reference found in a document; may be relative.
 * @param string $baseurl Absolute URL of the document containing the reference.
 *
 * @return string The absolute URL, or $srcurl unchanged when either URL cannot
 *                be parsed or the base has no scheme/host to resolve against.
 */
function format_url($srcurl, $baseurl)
{
    $srcinfo = parse_url($srcurl);
    if ($srcinfo === false || isset($srcinfo['scheme'])) {
        // Unparseable, or already absolute: nothing to resolve.
        return $srcurl;
    }

    $baseinfo = parse_url($baseurl);
    if ($baseinfo === false || !isset($baseinfo['scheme'], $baseinfo['host'])) {
        // Without a usable base there is nothing to resolve against.
        return $srcurl;
    }

    if (isset($srcinfo['host'])) {
        // Protocol-relative reference ("//host/path"): only the scheme is inherited.
        return $baseinfo['scheme'] . ':' . $srcurl;
    }

    // Rebuild the full authority component (userinfo, host, port) of the base.
    $authority = $baseinfo['host'];
    if (isset($baseinfo['user'])) {
        $userinfo = $baseinfo['user'];
        if (isset($baseinfo['pass'])) {
            $userinfo .= ':' . $baseinfo['pass'];
        }
        $authority = $userinfo . '@' . $authority;
    }
    if (isset($baseinfo['port'])) {
        $authority .= ':' . $baseinfo['port'];
    }

    // Treat backslashes as slashes so the result never contains them in the path.
    $srcpath  = isset($srcinfo['path']) ? str_replace('\\', '/', $srcinfo['path']) : '';
    $basepath = isset($baseinfo['path']) ? str_replace('\\', '/', $baseinfo['path']) : '';
    $query    = isset($srcinfo['query']) ? $srcinfo['query'] : null;

    if ($srcpath === '') {
        // "?query", "#fragment" or "": same document, so keep the base path
        // and fall back to the base query when the reference has none.
        $path = $basepath;
        if ($query === null && isset($baseinfo['query'])) {
            $query = $baseinfo['query'];
        }
    } elseif (substr($srcpath, 0, 1) === '/') {
        $path = $srcpath;
    } else {
        // Merge with everything up to and including the last "/" of the base
        // path. dirname() is not suitable here: it strips a trailing slash,
        // turning "/blog/1/" into "/blog" and losing one directory level.
        $slash = strrpos($basepath, '/');
        $path  = ($slash === false ? '/' : substr($basepath, 0, $slash + 1)) . $srcpath;
    }

    // Remove "." and ".." segments. Strict comparisons are required so that a
    // segment literally named "0" is kept.
    $segments = explode('/', $path);
    $last     = end($segments);
    $resolved = array();
    foreach ($segments as $segment) {
        if ($segment === '' || $segment === '.') {
            continue;
        }
        if ($segment === '..') {
            // Popping an empty stack is a no-op, which clamps ".." at the root.
            array_pop($resolved);
            continue;
        }
        $resolved[] = $segment;
    }

    $path = '/' . implode('/', $resolved);
    if (count($resolved) > 0 && ($last === '' || $last === '.' || $last === '..')) {
        // The path named a directory, so keep the trailing slash.
        $path .= '/';
    }

    $url = $baseinfo['scheme'] . '://' . $authority . $path;
    if ($query !== null) {
        $url .= '?' . $query;
    }
    if (isset($srcinfo['fragment'])) {
        $url .= '#' . $srcinfo['fragment'];
    }

    return $url;
}

// Usage example.
$srcurl = '/guestbook.php';
$baseurl = 'http://www.youkud.com/index.php/ddd.html';
echo format_url($srcurl, $baseurl);