Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to stream creation of a JSON File?

Am trying to create a JSON file from a large dump of a database query, and works when I set a LIMIT to 100000 rows being returned, but when I want all rows to be returned, it just goes to a 502 Error (The page request was canceled because it took too long to complete). Wondering if there is a way that I can streamline the creation of a JSON File in bits using php, or if there is a Library out there that will allow me to build the json file in parts?

Basically am running a .php file here to attempt to get all orders in json format from woocommerce, since the plugin I purchased "CSV Import Suite" does not work when importing orders, it just stays in the queue.

So, I decided to try and export all orders myself, but keep hitting a 502 Error Page and it never creates the .json file either, so am thinking I need a way to stream this somehow. Any help on this would be appreciated...

ini_set('memory_limit', '-1');
ini_set('max_execution_time', '-1');
set_time_limit(0);
error_reporting(E_ALL);
ob_implicit_flush(TRUE);
ob_end_flush();

global $wpdb, $root_dir;

if (!defined('ABSPATH'))
    $root_dir = dirname(__FILE__) . '/';
else
    $root_dir = ABSPATH;


$download = isset($_GET['download']);

// Allows us to use WP functions in a .php file without 404 headers!
require_once($root_dir . 'wp-config.php');
$wp->init();
$wp->parse_request();
$wp->query_posts();
$wp->register_globals();

if (empty($download))
    $wp->send_headers();

// exclude
$exclude_post_statuses = array('trash', 'wc-refunded', 'wc_cancelled');


$start_date = !empty($_GET['start_date']) ? DateTime::createFromFormat('Y-m-d', $_GET['start_date']) : '';
$end_date = !empty($_GET['end_date']) ? DateTime::createFromFormat('Y-m-d', $_GET['end_date']) : '';


$order_db = array(
    'columns' => array(
        'p' => array('ID', 'post_author', 'post_date', 'post_date_gmt', 'post_content', 'post_title', 'post_excerpt', 'post_status', 'comment_status', 'ping_status', 'post_password', 'post_name', 'to_ping', 'pinged', 'post_modified', 'post_modified_gmt', 'post_content_filtered', 'post_parent', 'guid', 'menu_order', 'post_type', 'post_mime_type', 'comment_count'),
        'pm' => array('meta_id', 'post_id', 'meta_key', 'meta_value'),
        'oi' => array('order_item_id', 'order_item_name', 'order_item_type', 'order_id'),
        'oim' => array('meta_id', 'order_item_id', 'meta_key', 'meta_value')
    )
);

$select_data = '';
$total_columns = count($order_db['columns']);
$i = 1;

foreach($order_db['columns'] as $column_key => $columns)
{
    $select_data .= implode(', ', array_map(
        function ($v, $k) { return $k . '.' . $v . ' AS ' . $k . '_' . $v; },
        $columns,
        array_fill(0, count($columns), $column_key)
    ));

    if ($i < $total_columns)
        $select_data .= ', ';

    $i++;
}

// HUGE DATABASE DUMP HERE, needs to be converted to JSON, after getting all columns of all tables...
$orders_query = $wpdb->get_results('
    SELECT ' . $select_data . '
    FROM ' . $wpdb->posts . ' AS p
    INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
    LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
    LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
    WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
    ORDER BY p.ID ASC', ARRAY_A);

$json = array();

if (!empty($orders_query))
{
    foreach($orders_query as $order_query)
    {
        if (!isset($json[$order_query['p_post_type']], $json[$order_query['p_post_type']][$order_query['p_post_name']]))
            $json[$order_query['p_post_type']][$order_query['p_post_name']] = array(
                'posts' => array(),
                'postmeta' => array(),
                'woocommerce_order_items' => array(),
                'woocommerce_order_itemmeta' => array()
            );

        if (!empty($order_query['p_ID']))
            $json[$order_query['p_post_type']][$order_query['p_post_name']]['posts'][$order_query['p_ID']] = array_filter($order_query, function($k) {
                $is_p = strpos($k, 'p_');
                return $is_p !== FALSE && empty($is_p);
            }, ARRAY_FILTER_USE_KEY);

        if (!empty($order_query['pm_meta_id']))
            $json[$order_query['p_post_type']][$order_query['p_post_name']]['postmeta'][$order_query['pm_meta_id']] = array_filter($order_query, function($k) {
                $is_pm = strpos($k, 'pm_');
                return $is_pm !== FALSE && empty($is_pm);
            }, ARRAY_FILTER_USE_KEY);

        if (!empty($order_query['oi_order_item_id']))
            $json[$order_query['p_post_type']][$order_query['p_post_name']]['woocommerce_order_items'][$order_query['oi_order_item_id']] = array_filter($order_query, function($k) {
                $is_io = strpos($k, 'oi_');
                return $is_io !== FALSE && empty($is_io);
            }, ARRAY_FILTER_USE_KEY);


        if (!empty($order_query['oim_meta_id']))
            $json[$order_query['p_post_type']][$order_query['p_post_name']]['woocommerce_order_itemmeta'][$order_query['oim_meta_id']] = array_filter($order_query, function($k) {
                $is_oim = strpos($k, 'oim_');
                return $is_oim !== FALSE && empty($is_oim);
            }, ARRAY_FILTER_USE_KEY);
    }
}

// Downloading or viewing?
if (!empty($download))
{
    // Outputs json in a textarea for you to copy and paste into a .json file for import...

    if (!empty($json))
    {
        $filename = uniqid('orders_') . '.json';

        $fp = fopen($filename, 'w');
        fwrite($fp, json_encode($json));
        fclose($fp);


        $size   = filesize($root_dir . '/' . $filename);
        header('Content-Description: File Transfer');
        header('Content-Type: application/octet-stream');
        header("Content-Disposition: attachment; filename=\"" . $filename . "\""); 
        header('Content-Transfer-Encoding: binary');
        header('Connection: Keep-Alive');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Pragma: public');
        header('Content-Length: ' . $size);

        readfile($root_dir . '/' . $filename);
    }

}
else
{
    // Outputs json in a textarea for you to copy and paste into a .json file for import...
    if (!empty($json))
        echo '<textarea cols="200" rows="50">', json_encode($json), '</textarea>';
}

The JSON File created could be a well over 500 MB, and possibly even up to 1 Gig of data. So, I believe PHP is running out of memory here, and needs to be processed bit by bit somehow, either in the background, or completely, without hitting the php memory limit. I believe the memory limit is set to 1024 MB, which is pretty high, but not high enough and tbh, for what I'm doing, I don't think we can ever have enough memory to perform the operation as is. Something needs to change in how I process the json and/or download it. And I do not want to create multiple json files, please only 1 JSON file.

like image 388
Solomon Closson Avatar asked Jan 31 '18 06:01

Solomon Closson


People also ask

Can JSON be streamed?

JSON streaming comprises communications protocols to delimit JSON objects built upon lower-level stream-oriented protocols (such as TCP), that ensures individual JSON objects are recognized, when the server and clients use the same one (e.g. implicitly coded in).

Can a JSON file be a video?

Of course you could send video-Data in JSON, but since JSON is essentially a String-Format with several reserved Characters you will have to encode the video-data. You could use any scheme you like - but BASE64 is a widely accepted Standard so I would use it.


1 Answers

I think there might be couple of issues. Firstly I would suggest you do some profiling.

    // HUGE DATABASE DUMP HERE, needs to be converted to JSON, after getting all columns of all tables...
    echo 'Start Time: '. date("Y-m-d H:i:s");
    echo ' Memory Usage: ' . (memory_get_usage()/1048576) . ' MB \n';

    $orders_query = $wpdb->get_results('
        SELECT ' . $select_data . '
        FROM ' . $wpdb->posts . ' AS p
        INNER JOIN ' . $wpdb->postmeta . ' AS pm ON (pm.post_id = p.ID)
        LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_items AS oi ON (oi.order_id = p.ID)
        LEFT JOIN ' . $wpdb->prefix . 'woocommerce_order_itemmeta AS oim ON (oim.order_item_id = oi.order_item_id)
        WHERE p.post_type = "shop_order"' . (!empty($exclude_post_statuses) ? ' AND p.post_status NOT IN ("' . implode('","', $exclude_post_statuses) . '")' : '') . (!empty($start_date) ? ' AND post_date >= "' . $start_date->format('Y-m-d H:i:s') . '"' : '') . (!empty($end_date) ? ' AND post_date <= "' . $end_date->format('Y-m-d H:i:s') . '"' : '') . '
        ORDER BY p.ID ASC', ARRAY_A);

    echo 'End Time: '. date("Y-m-d H:i:s");
    echo ' Memory Usage: ' . (memory_get_usage()/1048576) . ' MB \n';
    die('Finished');

    $json = array();

The above will help you to know how much memory is in use, upto this point. If it fails before it prints 'Finished', we know it is not a json issue. If the script works fine then we can first create a csv file rather json. Since you are running a select query, (at this point) it does not have to be nested json file which you require. A flat structure can be achieved by just creating a CSV file.

$csvFile = uniqid('orders') . '.csv';
$fp = fopen($csvFile, 'w');
if (!empty($orders_query))
{
    $firstRow = true;
    foreach($orders_query as $order_query)
    {
        if(true === $firstRow) {
            $keys = array_keys($order_query);
            fputcsv($fp, $order_query);
            $firstRow = false;
        }

        fputcsv($fp, $order_query);
    }
}
fclose($fp);

If the above works fine you at-least have a csv file to work with.

At this point I am not sure how complex is your data structure nested. For e.g how many distinct values exist for 'p_post_type' and 'p_post_name' you are having. You might require to parse the csv file and create multiple json file for each ['p_post_type']['p_post_name']['posts'], ['p_post_type']['p_post_name']['posts'], ['p_post_type']['p_post_name']['woocommerce_order_items'] and ['p_post_type']['p_post_name']['woocommerce_order_itemmeta'].

If the number of files are few you can write a script to merge them automatically or do them manually. If you have too many nested items, the number of json files that might be created might be a lot and might be hard to merge them and might not be a feasible option.

If the number of json files are lot, I would like to know what is the purpose of having such a huge single json file. If export is an issue import might be an issue too, especially ingesting such a huge json file in memory. I believe if the purpose of creating the json file is to import it in some form, at some stage in future, I think you might have to look at the option of just having a csv file instead, which you use to filter out whatever is required at that point of time.

I hope this helps.

FURTHER UPDATE

It looks to me that $wpdb->get_results is using mysqli_query/mysql_query (depending on your configuration) to fetch the results. See wordpress query docs. It is not memory efficient way to fetch the data this way. I believe you might be failing at this point ($wpdb->get_results) itself. I would suggest you to run the query without using $wpdb. There is a concept of unbuffered query whenever large data retrieval is required, which has very low impact on the memory. Further information can be found here mysql unbuffering.

Even if you get past this point, you will still run into memory issues, due to the way how you are storing everything in $json variable which is eating up lot of your memory. $json is an array and it would interesting to know how PHP array works. PHP arrays are dynamic and they do not allocate extra memory every time a new element is added, since that would be extremely slow. It instead, increases the array size to the power of two, which means whenever the limit is exhausted it increases the array limit to twice its current limit and in the process tries to increase the memory to twice the limit. This has been less of an issue with PHP 7, since they have made some major changes to the php core. So if you have 2GB data that might be required to be stored in $json, the script might easily allocate anywhere between 3-4 GB memory, depending upon when it hits the limit. Further details can be found here php array and How does PHP memory actually work

If you consider the overhead of the $orders_query which is an array combined with overhead of $json it is quite substantial due to the way PHP array works.

You can also try to create another database B. So while you are reading from database A, you simultaneously start writing data to database B. In the end you have database B with all the data in it with the power of MySQL. You could also push the same data into a MongoDB which would be lightning fast and might help you with the json nesting you are after. MongoDB is meant to work really efficiently with large datasets.

JSON STREAMING SOLUTION

Firstly, I would like to say that streaming is sequential/linear process. As such, it is does not have memory of what was added before this point of time or what will added after this point of time. It works in small chunks and that is the reason it is so memory efficient. So when you actually write or read, the responsibility lies with the script, that it maintains a specific order, which is kind of saying you are writing/reading your own json, as streaming only understands text and has no clue about what json is and won't bother itself in writing/reading a correct one.

I have found a library on github https://github.com/skolodyazhnyy/json-stream which would help in you achieving what you want. I have experimented with the code and I can see it will work for you with some tweaks in your code.

I am going to write some pseudo-code for you.

//order is important in this query as streaming would require to maintain a proper order.
$query1 = select distinct p_post_type from ...YOUR QUERY... order by p_post_type;
$result1 = based on $query1; 

$filename = 'data.json';
$fh = fopen($filename, "w");
$writer = new Writer($fh);
$writer->enter(Writer::TYPE_OBJECT);  

foreach($result1 as $fields1) {
    $posttype = $fields1['p_post_type'];
    $writer->enter($posttype, Writer::TYPE_ARRAY); 

    $query2 = select distinct p_post_name from ...YOUR QUERY... YOUR WHERE ... and p_post_type= $posttype order by p_post_type,p_post_name;
    $result2 = based on $query2;

    foreach($result2 as $fields2) {
        $postname = $fields1['p_post_name'];
        $writer->enter($postname, Writer::TYPE_ARRAY); 

        $query3 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type= $posttype and p_post_name=$postname where p_ID is not null order by p_ID;
        $result3 = based on $query3;
        foreach($result2 as $field3) {
            $writer->enter('posts', Writer::TYPE_ARRAY); 
            // write an array item
            $writer->write(null, $field3);
        }
        $writer->leave(); 

        $query4 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type= $posttype and p_post_name=$postname where pm_meta_id is not null order by pm_meta_id;
        $result4 = based on $query4;
        foreach($result4 as $field4) {
            $writer->enter('postmeta', Writer::TYPE_ARRAY); 
            // write an array item
            $writer->write(null, $field4);
        }
       $writer->leave(); 

        $query5 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type= $posttype and p_post_name=$postname where oi_order_item_id is not null order by oi_order_item_id;
        $result5 = based on $query5;
        foreach($result5 as $field5) {
            $writer->enter('woocommerce_order_items', Writer::TYPE_ARRAY); 
            // write an array item
            $writer->write(null, $field5);
        }
        $writer->leave(); 

        $query6 = select ..YOUR COLUMNS.. from ...YOUR QUERY... YOUR WHERE ... and p_post_type= $posttype and p_post_name=$postname where oim_meta_id is not null order by oim_meta_id;
        $result6 = based on $query6;
        foreach($result6 as $field6) {
            $writer->enter('woocommerce_order_itemmeta', Writer::TYPE_ARRAY); 
            // write an array item
            $writer->write(null, $field5);
        }
        $writer->leave(); 

    }
$writer->leave(); 
fclose($fh);

You might have to start limiting your queries to 10 something until you get it right. Since the code above might not just work as it is. You should be able to read the code in similar fashion as the same library has got a Reader class to help. I have tested both reader and writer and they seem to work fine.

like image 178
Kamal Soni Avatar answered Oct 15 '22 14:10

Kamal Soni