Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Getting "raw" js and css code from the server with Chrome/Chromium using WWW::Mechanize::Chrome

I'm trying to use WWW::Mechanize::Chrome to download css/js files. Yes, there are other ways to get the files. But my requirement is this be done with WWW::Mechanize::Chrome. I want to know if it's possible.

I can do $mech->get($url) to the css or js file. It then shows up in the browser window, which I can then obtain with $mech->content. The problem is that the HTML entities are encoded, and decoding them results in a different file than the original (I tested this). This is a problem with js files: they don't run properly afterwards.

You can run this test script to see the files getting encoded.

use strict;
use warnings;

use Data::Dumper qw(Dumper);
use WWW::Mechanize::Chrome;

# Point the browser straight at the script file, as if it were a page.
my $mech = WWW::Mechanize::Chrome->new();
$mech->get('https://www.nytimes.com/vi-assets/static-assets/vendor-454814a0340940dc9b42.js');

# Dump what the browser hands back so the encoding of the file is visible.
my $rendered = $mech->content;
print Dumper $rendered;

I'm wondering if there is some kind of workaround to snag these files directly from the server. Again, must use WWW::Mechanize::Chrome.

like image 385
StevieD Avatar asked Dec 28 '25 09:12

StevieD


2 Answers

If nothing else, you could inject a script that downloads the file for you.

The following demonstrates this approach using Selenium::Chrome, but the approach can be adapted to WWW::Mechanize::Chrome.

use strict;
use warnings qw( all );

use FindBin             qw( $RealBin );    
use MIME::Base64        qw( decode_base64 );
use Selenium::Chrome    qw( );
use Time::HiRes         qw( sleep );
use Sub::ScopeFinalizer qw( scope_finalizer );

# nf = Non-fatal.
# Wrapper around the driver's find_element() that treats "nothing matched"
# as a normal outcome.
#
# Arguments: the web driver, then the locator arguments, which are handed to
# find_element() untouched.
# Returns the located node, or undef when the driver's exception says that
# no element could be located. Every other exception is rethrown.
sub nf_find_element {
   my ($driver, @locator) = @_;

   my $found;
   my $succeeded = eval {
      $found = $driver->find_element(@locator);
      1;
   };
   return $found if $succeeded;

   # Only the two "not found" messages are swallowed; anything else propagates.
   die($@)
      if $@ !~ /Unable to locate element|An element could not be located on the page using the given search parameters/;

   return undef;
}

# Non-fatal wrapper around the driver's find_elements().
#
# Arguments: the web driver, followed by the locator arguments, which are
# passed through to find_elements() unchanged.
#
# Returns the matching nodes: a list in list context, the array reference in
# scalar context. When the driver reports that nothing could be located, the
# result is an empty list in list context and undef in scalar context.
# Any other exception is rethrown.
sub nf_find_elements {
   my $web_driver = shift;
   my $nodes;
   if (!eval {
      $nodes = $web_driver->find_elements(@_);
      return 1;  # No exception.
   }) {
      # Bare return on purpose: "return undef" would produce the one-element
      # list (undef) in list context, so "my @nodes = nf_find_elements(...)"
      # would look non-empty even though nothing was found.
      return if $@ =~ /Unable to locate element|An element could not be located on the page using the given search parameters/;
      die($@);
   }

   return wantarray ? @$nodes : $nodes;
}

# Wrapper around the driver's find_child_element() that treats "nothing
# matched" as a normal outcome.
#
# Arguments: the web driver, then the arguments for find_child_element(),
# which are forwarded untouched.
# Returns the located node, or undef when the driver's exception says that
# no element could be located. Every other exception is rethrown.
sub nf_find_child_element {
   my ($driver, @search_args) = @_;

   my $child;
   my $succeeded = eval {
      $child = $driver->find_child_element(@search_args);
      1;
   };
   return $child if $succeeded;

   # Only the two "not found" messages are swallowed; anything else propagates.
   die($@)
      if $@ !~ /Unable to locate element|An element could not be located on the page using the given search parameters/;

   return undef;
}

# Non-fatal wrapper around the driver's find_child_elements().
#
# Arguments: the web driver, followed by the arguments for
# find_child_elements(), which are passed through unchanged.
#
# Returns the matching nodes: a list in list context, the array reference in
# scalar context. When the driver reports that nothing could be located, the
# result is an empty list in list context and undef in scalar context.
# Any other exception is rethrown.
sub nf_find_child_elements {
   my $web_driver = shift;
   my $nodes;
   if (!eval {
      $nodes = $web_driver->find_child_elements(@_);
      return 1;  # No exception.
   }) {
      # Bare return on purpose: "return undef" would produce the one-element
      # list (undef) in list context, so the caller's array would look
      # non-empty even though nothing was found.
      return if $@ =~ /Unable to locate element|An element could not be located on the page using the given search parameters/;
      die($@);
   }

   return wantarray ? @$nodes : $nodes;
}

# Warning: This clears the log.
# Counts the entries in the driver's 'browser' log whose level is 'SEVERE'
# and whose source is 'javascript'. The count doubles as a boolean: zero
# means no such entry was logged.
sub has_js_failed {
   my ($web_driver) = @_;

   my $failures = 0;
   for my $entry (@{ $web_driver->get_log('browser') }) {
      # Entries may lack either field; compare quietly instead of warning.
      no warnings qw( uninitialized );
      next if $entry->{level} ne 'SEVERE';
      next if $entry->{source} ne 'javascript';
      ++$failures;
   }

   return $failures;
}

{
   # JavaScript that runs inside the loaded page. It fetches the URL passed
   # as the script argument, base64-encodes the response body, and reports
   # the outcome by appending <form id="exit"> to the document body with two
   # hidden inputs: "code" ("success" or "error") and "msg" (the base64
   # payload, or the failure reason).
   my $js = <<'__EOS__';
      var array_buffer_to_base64 = function(buf) {
         let binary = '';
         let bytes = new Uint8Array(buf);
         for (let byte of bytes) {
            binary += String.fromCharCode(byte);
         }

         return btoa(binary);
      };

      var set_response = function(code, msg) {
         let code_node = document.createElement('input');
         code_node.setAttribute('type', 'hidden');
         code_node.setAttribute('name', 'code');
         code_node.setAttribute('value', code);

         let msg_node = document.createElement('input');
         msg_node.setAttribute('type', 'hidden');
         msg_node.setAttribute('name', 'msg');
         msg_node.setAttribute('value', msg);

         let form_node = document.createElement('form');
         form_node.setAttribute('id', 'exit');
         form_node.appendChild(code_node);
         form_node.appendChild(msg_node);

         document.body.appendChild(form_node);
      };

      var request = function(url) {
         fetch(url)
            .then(
               response => {
                  if (!response.ok)
                     throw new Error("HTTP error: " + response.status);

                  return response.arrayBuffer();
               }
            )
            .then(
               buffer => set_response("success", array_buffer_to_base64(buffer)),
               reason => set_response("error",   reason),
            );
      };

      request(...arguments);
__EOS__

   # Longest time, in seconds, to wait for the page to publish the exit form.
   my $timeout = 30;

   # The guard is declared before the driver exists so that the driver binary
   # is shut down however this block is left (presumably when $guard is
   # destroyed at scope exit -- see Sub::ScopeFinalizer).
   my $web_driver;
   my $guard = scope_finalizer {
      if ($web_driver) {
         $web_driver->shutdown_binary();
         $web_driver = undef;
      }
   };

   $web_driver = Selenium::Chrome->new(
      binary => "$RealBin/chromedriver.exe",
   );

   # Load a page first; the download script is then run inside that page.
   $web_driver->get('https://www.nytimes.com/');

   $web_driver->execute_script($js, 'https://www.nytimes.com/vi-assets/static-assets/vendor-454814a0340940dc9b42.js');

   # Poll until the injected script has appended the exit form. The deadline
   # keeps the loop from spinning forever when the form never shows up.
   my $deadline = time() + $timeout;
   my $exit_form_node;
   while (1) {
      if (has_js_failed($web_driver)) {
         die("JavaScript error detected.\n");
      }

      $exit_form_node = nf_find_element($web_driver, '/html/body/form[@id="exit"]')
         and last;

      if (time() >= $deadline) {
         die("Timed out after $timeout seconds waiting for the download to finish.\n");
      }

      sleep(0.250);
   }

   # A missing input leaves the value undefined, which is reported as an
   # error below instead of dying on a method call on undef.
   my $code_node = nf_find_child_element($web_driver, $exit_form_node, 'input[@name="code"]');
   my $msg_node  = nf_find_child_element($web_driver, $exit_form_node, 'input[@name="msg"]');
   my $code = $code_node ? $code_node->get_value() : undef;
   my $msg  = $msg_node  ? $msg_node->get_value()  : undef;

   if (!defined($code) || $code ne 'success') {
      $msg ||= "Unknown error";
      die("$msg\n");
   }

   my $doc = decode_base64($msg);

   # The payload is raw bytes; keep STDOUT from translating them.
   binmode STDOUT;
   print $doc;
}

Might want to add a timeout on the polling loop so it doesn't wait forever if something goes wrong.

like image 130
ikegami Avatar answered Dec 30 '25 23:12

ikegami


OK, here's some code to show how to do this with WMC. Thanks again to @ikegami for the idea and for the code showing how this can be done with JavaScript using Selenium, upon which this code is based. It's an ingenious little workaround.

The code below modifies his js code example a bit to allow for multiple files and it adds an element that becomes visible so $mech can detect when the data is ready for it to grab and save.

Step 0: Add in needed Perl packages

use MIME::Base64;
use WWW::Mechanize::Chrome;
# etc.

Step 1: Create a sub in Perl to output the js functions:

# Returns the JavaScript source that defines the in-page download helpers:
#
#   array_buffer_to_base64(buf)    - base64-encodes an ArrayBuffer.
#   set_response(code, msg, number) - appends <form id="exit-N"> to the body,
#                                     holding hidden inputs #code-N and #msg-N
#                                     plus a <span id="vis-N"> marker.
#   request(url, number)            - fetches url and reports the outcome
#                                     through set_response().
#
# Both the success and the error path pass the file number on, so a failed
# download is published under the same "-N" ids the caller is waiting for.
sub js_here {
return <<'JS';
var array_buffer_to_base64 = function(buf) {
   let binary = '';
   let bytes = new Uint8Array(buf);
   for (let byte of bytes) {
      binary += String.fromCharCode(byte);
   }

   return btoa(binary);
};

var set_response = function(code, msg, number) {
   let code_node = document.createElement('input');
   code_node.setAttribute('type', 'hidden');
   code_node.setAttribute('id', 'code-' + number);
   code_node.setAttribute('value', code);

   let msg_node = document.createElement('input');
   msg_node.setAttribute('type', 'hidden');
   msg_node.setAttribute('id', 'msg-' + number);
   msg_node.setAttribute('value', msg);

   let vis_node = document.createElement('span');
   vis_node.setAttribute('id', 'vis-' + number);
   vis_node.setAttribute('value', '');

   let form_node = document.createElement('form');
   form_node.setAttribute('id', 'exit-' + number);
   form_node.appendChild(code_node);
   form_node.appendChild(msg_node);
   form_node.appendChild(vis_node);

   document.body.appendChild(form_node);
};

var request = function(url, number) {
   fetch(url)
      .then(
         response => {
            if (!response.ok)
               throw new Error("HTTP error: " + response.status);

            return response.arrayBuffer();
         }
      )
      .then(
         buffer => set_response("success", array_buffer_to_base64(buffer), number),
         // The number must be passed here too; without it the ids end in "-undefined".
         reason => set_response("error",   reason, number),
      );
};
JS
}

Step 2: Inject the code into the web page already loaded by mech with something like this:

# Define the helper functions inside the page that $mech has already loaded.
my $helper_js = js_here();
$mech->eval_in_page($helper_js);

Step 3: Create a caller Perl subroutine which will call the js inserted in step 1.

# Builds the JavaScript statement that starts the in-page download of one file.
#
# Arguments:
#   $url    - address of the file to fetch.
#   $number - number identifying the file; it is inserted into the statement
#             as-is and becomes the suffix of the element ids.
#
# Returns the string "request('<url>', <number>)". The URL is escaped for use
# inside a single-quoted JavaScript string literal, so a quote, backslash or
# line break in it cannot terminate the literal early.
sub js_download {
  my ($url, $number) = @_;

  (my $js_url = $url) =~ s{([\\'])}{\\$1}g;
  $js_url =~ s{\r}{\\r}g;
  $js_url =~ s{\n}{\\n}g;

  return "request('$js_url', $number)";
}

Note that this takes two arguments: the URL of the file and an arbitrary number identifying the file.

Step 4: Add in the code to actually do the downloading and saving of the files.

Here it is in a loop for downloading multiple files:

  my $count = 1;
  for my $file (@files) {
    $mech->clear_js_errors;

    # Start the in-page download; its result ends up in hidden form fields.
    $mech->eval_in_page( js_download($file, $count) );

    # Report any JavaScript errors and move on to the next file.
    if ($mech->js_errors) {
      warn "A javascript error encountered while fetching $file. Skipping file.\n";
      for my $js_error ( $mech->js_errors() ) {
        my $text = $js_error->{message} || '';
        warn "\t" . $text . "\n";
      }
      next;
    }

    # Wait for this file's marker element, then read the status field.
    $mech->wait_until_visible(selector => "#vis-$count");
    $mech->form_id( "exit-$count" );
    my $status = $mech->value("#code-$count", one => 1);
    if ( $status eq 'error' ) {
      warn "Unable to download $file: \n";
      warn $mech->value("#msg-$count") . "\n";
      next;
    }

    # Decode the base64 payload and hand it over for saving.
    my $encoded = $mech->value("#msg-$count", one => 1);
    my $decoded = decode_base64($encoded);
    _save_file ($decoded, $file);  # up to you how to implement
  }
  continue {
    # Runs after every iteration, including those cut short by "next".
    ++$count;
  }

That's it.

like image 32
StevieD Avatar answered Dec 31 '25 00:12

StevieD



Donate For Us

If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!