Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Finding URL redirects with HTTP headers and curl?

I'm trying to code a redirect checker, to check if a URL is search engine friendly. It has to check if a URL is redirected or not, and if it's redirected it has to tell if it's SEO friendly (301 status code) or not (302/304).

Here's something similar I've found: http://www.webconfs.com/redirect-check.php

It also should be able to follow multiple redirects (e.g. from A to B to C) and tell me that A redirects to C.

This is what I've got so far, but it doesn't work quite right (for example, when I type in www.example.com, it doesn't find the redirect to www.example.com/page1):

<?php
// Redirect checker: reads a host name from the POSTed form field "url",
// requests http://<host>/ with cURL and reports whether the server answers
// with an HTTP redirect and, if so, which URL the redirect chain ends at.

// You can edit the messages of the respective code over here
$httpcode  = array();
$httpcode["200"] = "Ok";
$httpcode["201"] = "Created";
$httpcode["302"] = "Found";
$httpcode["301"] = "Moved Permanently";
$httpcode["304"] = "Not Modified";
$httpcode["400"] = "Bad Request";


if(count($_POST)>0)
{
    // A missing or non-string field (e.g. url[]=x) is treated as an empty host
    // instead of raising a notice or being concatenated as "Array".
    $url = (isset($_POST["url"]) && is_string($_POST["url"])) ? $_POST["url"] : "";
    $curlurl = "http://".$url."/";

    // The host is echoed back into the HTML page below, so escape it once here.
    $safeurl = htmlspecialchars($url, ENT_QUOTES, "UTF-8");

    $ch = curl_init();
    // Set URL to download
    curl_setopt($ch, CURLOPT_URL, $curlurl);

    // User agent: forward the visitor's one, but only when the client sent it
    if(isset($_SERVER["HTTP_USER_AGENT"]))
    {
        curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER["HTTP_USER_AGENT"]);
    }

    // Include header in result? (1 = yes, 0 = no)
    curl_setopt($ch, CURLOPT_HEADER, 0);

    // Should cURL return or print out the data? (true = return, false = print)
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // Timeout in seconds
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);

    // First pass: do NOT follow redirects, so the status code we read belongs
    // to the requested URL itself and not to the end of the chain.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

    // Download the given URL, and return output
    $output = curl_exec($ch);

    $curlinfo = curl_getinfo($ch);

    if($output === false)
    {
        // A failed transfer has http_code 0; report it instead of claiming
        // that the URL is not redirected.
        echo $safeurl." could not be checked: ".htmlspecialchars(curl_error($ch), ENT_QUOTES, "UTF-8");
    }
    else if(in_array((int)$curlinfo["http_code"], array(301, 302, 303, 307, 308), true))
    {
        // Second pass on the same handle (all other options are kept): follow
        // the whole chain so that $curlinfo["url"] is the final destination.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // Download the given URL, and return output
        $output = curl_exec($ch);

        $curlinfo = curl_getinfo($ch);
        echo $safeurl." is redirected to ".htmlspecialchars($curlinfo["url"], ENT_QUOTES, "UTF-8");
    }
    else
    {
        echo $safeurl." is not getting redirected";
    }

    // Close the cURL resource, and free system resources
    curl_close($ch);
}
?>
<form action="" method="post">
http://<input type="text" name="url" size="30" />/ <b>e.g. www.google.com</b><br/>
<input type="submit" value="Submit" />
</form>
like image 628
deltitnu Avatar asked Dec 16 '11 12:12

deltitnu


1 Answers

Well, if you want to record every redirect, you have to implement it yourself and turn off the automatic "location following":

/**
 * Follows a chain of HTTP redirects one hop at a time and records every hop.
 *
 * Automatic location following is switched off; each URL is requested with a
 * header-only request (CURLOPT_NOBODY) and the next URL is taken from cURL's
 * 'redirect_url' info field or, when that field is not set, from
 * get_redirect_url() applied to the raw response headers.
 *
 * Tracing stops when a response carries no redirect target, when a request
 * fails, when a URL that was already requested comes up again, or when the
 * time budget is used up.
 *
 * @param string    $url     URL to start tracing from.
 * @param int|float $timeout Total time budget in seconds, shared by all hops.
 *
 * @return array List of trace items in request order. Each item is one of:
 *               - the curl_getinfo() array of a completed request, with
 *                 'redirect_url' filled in;
 *               - array('errorno' => ..., 'error' => ...) for a failed request;
 *               - array('timeout' => <remaining budget>) as the last item when
 *                 the budget ran out while a redirect was still pending.
 */
function curl_trace_redirects($url, $timeout = 15) {

    $result = array();
    $ch = curl_init();

    $trace = true;
    $currentUrl = $url;

    // URLs already requested, used as a set to break out of redirect loops.
    $urlHist = array();
    while($trace && $timeout > 0 && !isset($urlHist[$currentUrl])) {
        $urlHist[$currentUrl] = true;

        curl_setopt($ch, CURLOPT_URL, $currentUrl);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        // CURLOPT_TIMEOUT takes whole seconds and 0 means "no limit". The
        // remaining budget is a float after the first hop, so round it up:
        // a remainder below one second must not turn into an unlimited wait.
        curl_setopt($ch, CURLOPT_TIMEOUT, max(1, (int) ceil($timeout)));

        $output = curl_exec($ch);

        if($output === false) {
            $traceItem = array(
                'errorno' => curl_errno($ch),
                'error' => curl_error($ch),
            );

            $trace = false;
        } else {
            $curlinfo = curl_getinfo($ch);

            // Charge the time this hop took against the shared budget.
            if(isset($curlinfo['total_time'])) {
                $timeout -= $curlinfo['total_time'];
            }

            if(!isset($curlinfo['redirect_url'])) {
                $curlinfo['redirect_url'] = get_redirect_url($output);
            }

            if(!empty($curlinfo['redirect_url'])) {
                $currentUrl = $curlinfo['redirect_url'];
            } else {
                $trace = false;
            }

            $traceItem = $curlinfo;
        }

        $result[] = $traceItem;
    }

    // Only flag a timeout when the budget ran out while there was still a hop
    // to follow; a chain that completed on its last request is not a timeout.
    if($trace && $timeout <= 0) {
        $result[] = array('timeout' => $timeout);
    }

    curl_close($ch);

    return $result;
}

// apparently 'redirect_url' is not available on all curl-versions
// so we fetch the location header ourselves
//
// $header: raw response header text (what curl_exec() returns when
//          CURLOPT_HEADER is enabled).
// Returns the trimmed value of the first Location header line, or "" when
// there is no such line or its value is empty.
function get_redirect_url($header) {
    // [ \t]* instead of \s+: whitespace after the colon is optional in HTTP,
    // and \s would also match the line break, so an empty Location header
    // would return the following header line as the URL.
    if(preg_match('/^Location:[ \t]*(.*)$/mi', $header, $m)) {
        // trim() also drops the "\r" left over from CRLF line endings.
        return trim($m[1]);
    }

    return "";
}

And you use it like this:

// Trace the redirect chain of the example URL and print one line per item.
$res = curl_trace_redirects("http://www.example.com");
foreach($res as $item) {
    // Marker item: the time budget ran out.
    if(isset($item['timeout'])) {
        echo "Timeout reached!\n";
        continue;
    }

    // Failed request: show cURL's error message.
    if(isset($item['error'])) {
        echo "error: ", $item['error'], "\n";
        continue;
    }

    // Regular hop: the requested URL, plus the status code when it redirects.
    $line = $item['url'];
    if(!empty($item['redirect_url'])) {
        $line .= " -> (" . $item['http_code'] . ")";
    }

    echo $line, "\n";
}

It's possible that my code isn't fully thought out, but I guess it's a good start.

Edit

Here's some sample output:

http://midas/~stefan/test/redirect/fritzli.html -> (302)
http://midas/~stefan/test/redirect/hansli.html -> (301)
http://midas/~stefan/test/redirect/heiri.html
like image 131
vstm Avatar answered Sep 23 '22 06:09

vstm