Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

curl: can't fetch rss from website because of CloudFlare

I'm not able to connect to this site http://www.youm7.com/newtkarirrss.asp using curl on the server.

But I can access it from localhost without any problem.

Here is the test

http://www.tjreb.com/xml_grabber.php?feed=http://www.youm7.com/newtkarirrss.asp&stack=1

Try The CNN rss feed

http://www.tjreb.com/xml_grabber.php?feed=http://rss.cnn.com/rss/edition_meast.rss&stack=0

How can I bypass this error?

Here is my source code

<?php
  /**
   * Loads an RSS document from a local file, an http(s) URL or a raw XML
   * string and converts it into a plain summary array.
   *
   * Typical use: construct with the source, call load(), then fetch().
   * Failures are never thrown; they are collected and exposed by get_error().
   */
  class xml_grabber
  {
      private $xml_file    = '';
      private $xml_link    = '';
      private $xml_dom     = '';
      // '' = unknown, 1 = local file, 2 = URL, 3 = raw XML string
      private $xml_type    = '';
      // Parsed SimpleXMLElement after a successful load(), '' otherwise
      private $xml_content = '';
      private $xml_columns = array();
      private $xml_errors  = array();
      // When set to 1, load() prints the raw downloaded body and stops (debug aid)
      public  $xml_stack   = 0;

      /**
       * @param string $link_file_com File path, URL or raw XML to read from.
       */
      public function __construct($link_file_com = '')
      {
          if (!$link_file_com) {
              $this->xml_errors['construct'] = 'No Xml In Construct';
          } elseif (!function_exists('simplexml_load_file')
              || !function_exists('simplexml_load_string')
              || !function_exists('simplexml_import_dom')
          ) {
              $this->xml_errors['functions'] = 'simple xml function not exists';
          } else {
              $this->set_xml($link_file_com);
          }
      }

      /**
       * Classifies the source as raw XML, local file or URL and stores it.
       *
       * @param string $xml Source of at least four characters; anything
       *                    shorter (or not a string) resets the type to unknown.
       */
      public function set_xml($xml)
      {
          // $xml[3] replaces the $xml{3} offset syntax, which is a parse error on PHP 8.
          if (!is_string($xml) || !isset($xml[3])) {
              $this->xml_type = '';
              return;
          }

          // Markup is recognised first so that a large XML document is never
          // handed to file_exists() as if it were a path.
          $trimmed = ltrim($xml);
          if ($trimmed !== '' && $trimmed[0] === '<') {
              $this->xml_type = 3;
              $this->xml_dom  = $xml;
          } elseif (file_exists($xml)) {
              $this->xml_type = 1;
              $this->xml_file = $xml;
          } elseif (filter_var($xml, FILTER_VALIDATE_URL)) {
              $this->xml_type = 2;
              $this->xml_link = $xml;
          } else {
              $this->xml_type = 3;
              $this->xml_dom  = $xml;
          }
      }

      /**
       * @return string|false The stored file path, URL or raw XML, or false
       *                      when no usable source has been set.
       */
      public function get_xml()
      {
          switch ($this->xml_type) {
              case 1:
                  return $this->xml_file;
              case 2:
                  return $this->xml_link;
              case 3:
                  return $this->xml_dom;
              default:
                  return false;
          }
      }

      /**
       * @param array $new_columns
       * @return array The value that was stored.
       */
      public function set_columns($new_columns = array())
      {
          return $this->xml_columns = $new_columns;
      }

      /**
       * @return array Columns previously stored with set_columns().
       */
      public function get_columns()
      {
          return $this->xml_columns;
      }

      /**
       * Reads and parses the configured source so that fetch() can be called.
       *
       * @return bool True when the XML was parsed, false otherwise (details
       *              are available through get_error()).
       */
      public function load()
      {
          if ($this->xml_type === '') {
              $this->xml_errors['loader'] = 'Unknown XML type';
              return false;
          }

          if ($this->xml_type == 1) {
              return $this->parse_source($this->xml_file, true);
          }

          if ($this->xml_type == 2) {
              $con = $this->connect($this->xml_link);
              if ($this->xml_stack == 1) {
                  // Debug mode: show exactly what the remote server answered.
                  echo $con;
                  die();
              }
              if ($con === false) {
                  // connect() has already recorded why the download failed.
                  return false;
              }
              return $this->parse_source($con, false);
          }

          // Raw XML handed in directly: parse it so fetch() works for it too.
          return $this->parse_source($this->xml_dom, false);
      }

      /**
       * Converts the loaded feed into a summary.
       *
       * @param string $return 'array' (default), 'json', 'serialize' or 'xml'.
       * @return array|string|false The summary in the requested format, or
       *                            false when nothing usable is loaded.
       */
      public function fetch($return = 'array')
      {
          // Comparing a SimpleXMLElement with '' casts it to string, which is
          // empty for any root without text; test the type instead.
          if (!($this->xml_content instanceof SimpleXMLElement)) {
              $this->xml_errors['fetch'] = 'No Xml Content';
              return false;
          }

          $rss_feed = $this->xml_content;
          if (!isset($rss_feed->channel)) {
              $this->xml_errors['fetch'] = 'No RSS Channel In Xml';
              return false;
          }
          $channel = $rss_feed->channel;

          $rss_summary = array(
              'info' => array(
                  'title' => (string) $channel->title,
                  'link'  => (string) $channel->link,
                  'cat'   => (string) $channel->category,
                  'image' => isset($channel->image->url) ? (string) $channel->image->url : '',
              ),
              'item' => array(),
          );

          foreach ($channel->item as $item) {
              // Recomputed for every item so a missing enclosure never
              // inherits the URL of the previous item.
              $image_url = isset($item->enclosure['url']) ? (string) $item->enclosure['url'] : '';

              $rss_summary['item'][] = array(
                  'title'       => (string) $item->title,
                  'description' => (string) $item->description,
                  'link'        => (string) $item->link,
                  'date'        => (string) $item->pubDate,
                  'image'       => (string) $item->image,
                  'image2'      => $image_url,
              );
          }

          if ($return === 'json') {
              return json_encode($rss_summary);
          }
          if ($return === 'serialize') {
              return serialize($rss_summary);
          }
          if ($return === 'xml') {
              // Prefer a project-supplied xml_encode() when one exists;
              // otherwise use the built-in encoder instead of failing fatally.
              if (function_exists('xml_encode')) {
                  return xml_encode($rss_summary);
              }
              return $this->encode_xml($rss_summary);
          }

          return $rss_summary;
      }

      /**
       * Downloads a document over http(s).
       *
       * curl is tried first; if it is missing or fails, file_get_contents()
       * is used when allow_url_fopen is enabled. A browser-like User-Agent is
       * sent because some hosts reject requests without one. This does not
       * solve JavaScript challenge pages: those are returned as ordinary HTML
       * and will fail XML parsing.
       *
       * @param string $link Absolute http or https URL.
       * @return string|false Response body, or false on failure.
       */
      protected function connect($link)
      {
          $scheme = strtolower((string) parse_url($link, PHP_URL_SCHEME));
          // Only web URLs are allowed, so file://, ftp:// etc. cannot be used
          // to read arbitrary resources through this class.
          if (!filter_var($link, FILTER_VALIDATE_URL) || ($scheme !== 'http' && $scheme !== 'https')) {
              $this->xml_errors['connect'] = 'Not Vaild Link To Get data';
              return false;
          }

          $agent     = 'Mozilla/5.0 (compatible; xml_grabber/1.0)';
          $attempted = false;

          if (function_exists('curl_init')) {
              $attempted = true;
              $cu = curl_init();
              curl_setopt($cu, CURLOPT_URL, $link);
              curl_setopt($cu, CURLOPT_RETURNTRANSFER, true);
              curl_setopt($cu, CURLOPT_FOLLOWLOCATION, true);
              curl_setopt($cu, CURLOPT_MAXREDIRS, 5);
              curl_setopt($cu, CURLOPT_CONNECTTIMEOUT, 10);
              curl_setopt($cu, CURLOPT_TIMEOUT, 30);
              curl_setopt($cu, CURLOPT_USERAGENT, $agent);
              if (defined('CURLOPT_PROTOCOLS')) {
                  // Redirects must stay on http(s) as well.
                  curl_setopt($cu, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
                  curl_setopt($cu, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
              }
              // TLS peer/host verification is left at curl's secure defaults.

              $body = curl_exec($cu);
              if ($body === false || $body === '') {
                  $this->xml_errors['connect'] = 'No Result From Curl';
                  $this->xml_errors['curl']    = curl_error($cu);
                  $body = false;
              }
              if (PHP_VERSION_ID < 80000) {
                  // Handles are freed automatically from PHP 8 on.
                  curl_close($cu);
              }
              if ($body !== false) {
                  return $body;
              }
          }

          // allow_url_fopen cannot be switched on at runtime, so it is only read.
          $url_fopen = function_exists('ini_get')
              && filter_var(ini_get('allow_url_fopen'), FILTER_VALIDATE_BOOLEAN);

          if ($url_fopen && function_exists('file_get_contents')) {
              $attempted = true;
              $context = stream_context_create(array(
                  'http' => array(
                      'user_agent'      => $agent,
                      'timeout'         => 30,
                      'follow_location' => 1,
                      'max_redirects'   => 5,
                  ),
              ));
              // The warning is suppressed because the failure is reported
              // through get_error() instead.
              $body = @file_get_contents($link, false, $context);
              if ($body !== false && $body !== '') {
                  return $body;
              }
              $this->xml_errors['connect'] = 'No Result From file_get_contents';
          }

          if (!$attempted) {
              $this->xml_errors['connect'] = 'Curl And Allow Url Fopen Disabled On Server';
          }

          return false;
      }

      /**
       * @return array Error messages collected so far, keyed by stage.
       */
      public function get_error()
      {
          return $this->xml_errors;
      }

      /**
       * Parses a file or string into xml_content, recording libxml messages
       * under the 'parser' error key instead of emitting warnings.
       *
       * @param string $source  File path or XML text.
       * @param bool   $is_file True when $source is a path.
       * @return bool  True on success.
       */
      private function parse_source($source, $is_file)
      {
          if (!is_string($source) || $source === '') {
              $this->xml_content = '';
              $this->xml_errors['parser'] = 'Empty XML source';
              return false;
          }

          $previous = libxml_use_internal_errors(true);
          $dom = $is_file
              ? simplexml_load_file($source, 'SimpleXMLElement', LIBXML_NOCDATA)
              : simplexml_load_string($source, 'SimpleXMLElement', LIBXML_NOCDATA);

          if ($dom === false) {
              $messages = array();
              foreach (libxml_get_errors() as $error) {
                  $messages[] = trim($error->message);
              }
              $this->xml_errors['parser'] = $messages ? implode('; ', $messages) : 'Invalid XML';
          }

          libxml_clear_errors();
          libxml_use_internal_errors($previous);

          $this->xml_content = ($dom === false) ? '' : $dom;
          return $dom !== false;
      }

      /**
       * Serialises the summary array as an XML document.
       *
       * @param array $data Summary produced by fetch().
       * @return string|false XML text, or false when the DOM extension is missing.
       */
      private function encode_xml(array $data)
      {
          if (!class_exists('DOMDocument')) {
              $this->xml_errors['fetch'] = 'DOM extension not available for xml output';
              return false;
          }

          $doc  = new DOMDocument('1.0', 'UTF-8');
          $root = $doc->createElement('rss_summary');
          $doc->appendChild($root);
          $this->append_xml($doc, $root, $data);

          return $doc->saveXML();
      }

      /**
       * Recursively appends array entries as child elements; numeric keys
       * become <entry> elements and scalar values become escaped text nodes.
       */
      private function append_xml($doc, $parent, array $data)
      {
          foreach ($data as $key => $value) {
              $node = $doc->createElement(is_int($key) ? 'entry' : $key);
              if (is_array($value)) {
                  $this->append_xml($doc, $node, $value);
              } else {
                  $node->appendChild($doc->createTextNode((string) $value));
              }
              $parent->appendChild($node);
          }
      }
  }


// Demo endpoint: ?feed=<http(s) URL>&fetch=<array|json|serialize|xml>&stack=<0|1>
// Prints the parsed feed summary, or the collected errors when nothing was parsed.

// Plain text keeps feed-supplied markup from being rendered as HTML in the browser.
if (!headers_sent()) {
    header('Content-Type: text/plain; charset=utf-8');
}

// The URL is used as-is: addslashes() is an SQL-era escape and only corrupts URLs.
if (isset($_GET['feed']) && is_string($_GET['feed'])) {
    $url = trim($_GET['feed']);
} else {
    $url = 'http://rss.cnn.com/rss/edition_meast.rss';
}

$fetch = (isset($_GET['fetch']) && is_string($_GET['fetch'])) ? $_GET['fetch'] : 'array';
$stack = (isset($_GET['stack']) && is_scalar($_GET['stack'])) ? intval($_GET['stack']) : 0;

/*
 http://www.youm7.com/new3agelrss.asp
 http://www.youm7.com/newtkarirrss.asp
 http://www.almasryalyoum.com/rss_feed_term/223241/rss.xml
 http://gdata.youtube.com/feeds/api/playlists/18A7E36C33EF4B5D?v=2
 http://rss.cnn.com/rss/edition_meast.rss
 https://www.facebook.com/feeds/page.php?format=atom10&id=40796308305
 https://www.facebook.com/feeds/page.php?format=rss20&id=40796308305
 http://www.fwasl.com/feed
 https://www.facebook.com/feeds/page.php?format=atom10&id=378156838895039
 */

// Request input must be a web URL: the grabber also accepts local file paths,
// which must never be selectable from the query string.
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

if (!filter_var($url, FILTER_VALIDATE_URL) || ($scheme !== 'http' && $scheme !== 'https')) {
    print_r(array('feed' => 'Feed must be an http or https URL'));
} else {
    $xml = new xml_grabber($url);
    $xml->xml_stack = $stack;

    $res    = $xml->load();
    $result = $xml->fetch($fetch);

    if ($result) {
        print_r($result);
    } else {
        print_r($xml->get_error());
    }
}

?>
like image 777
Mona Abdelmajeed Avatar asked Aug 09 '12 15:08

Mona Abdelmajeed


2 Answers

You can't easily bypass Cloudflare. However you can hack the protection system. :)

First, parse the page (the Cloudflare protection page) and evaluate the arithmetic expression it contains (3+13*7 in this example; it will most probably be different for each request) in

// Sample of the script quoted from the protection page.
// Once the document is ready it waits 5850 ms, writes the result of the
// arithmetic expression (3+13*7 here) into #jschl_answer and submits
// #ChallengeForm.
$(function(){setTimeout(
            function(){
                $('#jschl_answer').val(3+13*7);
                $('#ChallengeForm').submit();
            },
            5850
)});

Then send a POST request to the same page with the "jschl_vc" value taken from #ChallengeForm in the parsed page and with "jschl_answer" set to the result of 3+13*7. After that, fetch the page again, sending the cookie that Cloudflare set. Once you have been added to the Cloudflare whitelist, you won't see that page anymore.

like image 200
burak emre Avatar answered Sep 30 '22 12:09

burak emre


You can pass cloudflare protection with PhantomJS http://phantomjs.org/ which can execute the cloudflare JS outside a browser with following little script "delay.js":

"use strict";
// delay.js: open a URL in PhantomJS, wait a fixed time so in-page scripts
// can run, then print the resulting page content to stdout.
//
// Usage: phantomjs delay.js URL delay
//   URL   - address to open
//   delay - time to wait after the page has loaded, in milliseconds
var page = require('webpage').create(),
    system = require('system'),
    address, delay;

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: delay.js URL delay');
    phantom.exit(1);
} else {
    address = system.args[1];
    // Command-line arguments are strings. Convert explicitly and reject
    // anything non-numeric, which setTimeout would otherwise treat as 0
    // and dump the page before its scripts have run.
    delay = parseInt(system.args[2], 10);
    if (isNaN(delay) || delay < 0) {
        console.log('Usage: delay.js URL delay (delay must be a non-negative number of milliseconds)');
        phantom.exit(1);
    } else {
        page.open(address, function (status) {
            if (status !== 'success') {
                console.log('Unable to load the address!');
                phantom.exit(1);
            } else {
                window.setTimeout(function () {
                    var content = page.content;
                    console.log(content);
                    phantom.exit();
                }, delay);
            }
        });
    }
}

run it as phantomjs delay.js http://protected.url 5000

This will get "protected.url" and wait 5000ms for the cloudflare code to load the real page and dumps it to stdout.

like image 34
andree Avatar answered Sep 30 '22 14:09

andree