Pages

Wednesday, December 26, 2012

How to implement a link scraper in PHP? - [resolved]


// Link scraper: downloads one page, collects the target of every <a> element
// inside <body>, turns relative targets into absolute URLs, and prints the
// de-duplicated list.

// Base URL of the page to scrape. It is also the prefix used to make relative
// links absolute (the original snippet referenced $urlname without defining it).
$urlname = "http://example.com";

// file_get_contents() returns false when the download fails.
$html = file_get_contents($urlname);

$list_urls = array();   // absolute URLs of the accepted links
$list_urlval = array(); // text content of each accepted link, same order as $list_urls

// Only parse when there is something to parse: DOMDocument::loadHTML() rejects
// an empty source (ValueError on PHP 8, warning on older versions).
if ($html !== false && $html !== '') {
    // Real-world HTML is rarely valid. Collect libxml's parse warnings
    // internally instead of hiding them with the @ operator, then restore the
    // previous error-handling mode.
    $previousLibxmlMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlMode);

    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    foreach ($hrefs as $nValue) {
        $href = trim($nValue->getAttribute('href'));
        $value = $nValue->nodeValue;

        // Skip anchors without a target, the bare site root, and any link
        // carrying a fragment (a '#' anywhere in the href, as before).
        if ($href === '' || $href === '/' || strpos($href, '#') !== false) {
            continue;
        }

        // Skip non-navigational schemes and Google+ links. The scheme check is
        // anchored to the start so that ordinary URLs which merely contain the
        // word (e.g. "/blog/javascript-tips") are no longer dropped.
        if (preg_match('/^(javascript|mailto):/i', $href) || preg_match('/plus\.google/', $href)) {
            continue;
        }

        // Make the link absolute. Only a leading http:// or https:// counts as
        // already absolute; "http" appearing later in a path does not.
        if (!preg_match('#^https?://#i', $href)) {
            if (strpos($href, '//') === 0) {
                // Protocol-relative link: reuse the scheme of the scraped page.
                $href = parse_url($urlname, PHP_URL_SCHEME) . ':' . $href;
            } else {
                // Join with exactly one slash between base and path.
                // NOTE: like the original, this resolves every relative link
                // against the base URL itself, not against the page's directory.
                $href = rtrim($urlname, '/') . '/' . ltrim($href, '/');
            }
        }

        $list_urls[] = $href;
        $list_urlval[] = $value;
    }
}

print_r(array_unique($list_urls));


No comments: