Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

I can store only a finite number of lines in a new text file

I have many different pseudo-random number generators written in C that generate an arbitrary number of pairs of random numbers (the count is given on the command line) and store them in a (new) text file: one pair of numbers per line, one number per column. I want to store 400.000.000 numbers in a text file, but when I look at the number of lines the file has, it has only 82.595.525 lines. This is the code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "../Calculos/myfunctions.c"

/*
 * Convert raw int values read from from_file into text lines in to_file.
 *
 * Each iteration reads two raw `int` values and writes one line of the form
 * "a b\n", where each number is |value| / 2^31 printed with 10 decimals
 * (with a 32-bit int the result lies in [0, 1]).
 *
 *   from_file       binary input stream, consumed 2 * sizeof(int) bytes per pair
 *   to_file         text output stream, receives one line per pair
 *   how_many_pairs  number of lines to produce
 *
 * The function stops early, after printing a diagnostic on stderr, when the
 * input cannot supply a complete pair or when a write fails.  Because stdio
 * buffers output, a write failure may only be reported once the buffer is
 * flushed; callers should still check ferror()/fclose() on to_file.
 */
void outputDevRandomOpenFile (FILE * from_file, FILE * to_file, unsigned long long how_many_pairs){

    const double max_period = 2147483648.0;   /* 2^31 */
    unsigned long long i;
    int pair[2];

    for (i = 0; i < how_many_pairs; i++){

        /* Read the whole pair first so a truncated input never yields a
           half-written line or a value recycled from the previous read. */
        if (fread (pair, sizeof pair[0], 2, from_file) != 2){
            fprintf (stderr, "outputDevRandomOpenFile: input ended after %llu pairs\n", i);
            return;
        }

        if (fprintf (to_file, "%.10lf %.10lf\n",
                     fabs (((double) pair[0]) / max_period),
                     fabs (((double) pair[1]) / max_period)) < 0){
            perror ("outputDevRandomOpenFile: write failed");
            return;
        }
    }
}


/*
 * Usage: <program> how_many_pairs
 *
 * Reads raw integers from /dev/urandom and writes how_many_pairs lines of
 * two numbers each to generated_numbers_devrandom.txt.
 *
 * Returns EXIT_SUCCESS when the output file was written and closed without
 * error, EXIT_FAILURE on bad arguments, on a file that cannot be opened, or
 * on an output error (for example when the file hits a size limit).
 */
int main (int argc, char *argv[]){

    char * endptr;
    unsigned long long how_many_pairs;
    FILE * urandom;
    FILE * to_file;
    int status = EXIT_SUCCESS;

    if (argc < 2){
        fprintf (stderr, "usage: %s how_many_pairs\n", argc > 0 ? argv[0] : "devrandom");
        return EXIT_FAILURE;
    }

    /* Reject empty input and trailing garbage instead of silently using 0. */
    how_many_pairs = strtoull (argv[1], &endptr, 10);
    if (endptr == argv[1] || *endptr != '\0'){
        fprintf (stderr, "invalid number of pairs: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    urandom = fopen ("/dev/urandom", "rb");
    if (urandom == NULL){
        perror ("/dev/urandom");
        return EXIT_FAILURE;
    }

    to_file = fopen ("generated_numbers_devrandom.txt", "w");
    if (to_file == NULL){
        perror ("generated_numbers_devrandom.txt");
        fclose (urandom);
        return EXIT_FAILURE;
    }

    outputDevRandomOpenFile (urandom, to_file, how_many_pairs);

    /* A failed write leaves the stream's error indicator set. */
    if (ferror (to_file)){
        fprintf (stderr, "error while writing generated_numbers_devrandom.txt\n");
        status = EXIT_FAILURE;
    }

    fclose (urandom);

    /* fclose flushes buffered output, so its result must be checked too. */
    if (fclose (to_file) != 0){
        perror ("generated_numbers_devrandom.txt");
        status = EXIT_FAILURE;
    }

    return status;
}

At first I suspected that there was some issue in the code (i.e. I could be choosing the wrong types of variables somewhere), but I tested it by including inside the for-loop an if (i > 165191050) printf ("%llu\n", i); (remember that I'm using a 1-D array for storing couples of numbers, not a 2-D one, so in the condition I just multiply 82595525*2) to test whether the problem was that the code was not looping 800.000.000 times, but only 165191050. When I performed the test, after i = 165191050, it just started to print out i values on the shell, so it really looped those 800.000.000 times, but when I looked at the number of lines of the generated text file, there were 82595525 lines again. So I'm betting the problem is not in the code (or at least not in the types of variables I used).

I'm also getting the same results with this algorithm (this is just another different pseudo-random number generator):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define MT_LEN 624

/* Generator state: MT_LEN words plus the current read position. */
int mt_index;
unsigned long mt_buffer[MT_LEN];

/*
 * Seed the generator: every word of mt_buffer is filled with the next
 * value from rand(), and the read position is reset to the start.
 * The sequence therefore depends on the current rand() seed.
 */
void mt_init() {
    unsigned long *slot = mt_buffer;
    unsigned long *const end = mt_buffer + MT_LEN;

    while (slot != end)
        *slot++ = rand();

    mt_index = 0;
}

#define MT_IA           397
#define MT_IB           (MT_LEN - MT_IA)
#define UPPER_MASK      0x80000000
#define LOWER_MASK      0x7FFFFFFF
#define MATRIX_A        0x9908B0DF
/* Top bit of word i combined with the low 31 bits of word j.  The whole
   expansion is parenthesized so it stays one operand in any expression. */
#define TWIST(b,i,j)    (((b)[i] & UPPER_MASK) | ((b)[j] & LOWER_MASK))
/* MATRIX_A when the low bit of s is set, 0 otherwise. */
#define MAGIC(s)        (((s)&1)*MATRIX_A)

/*
 * Return the next word from mt_buffer.
 *
 * mt_index is a BYTE offset into mt_buffer; it advances by
 * sizeof(unsigned long) on every call.  When it reaches the end of the
 * buffer, all MT_LEN words are regenerated in place and reading restarts
 * at offset 0.  Until that first happens, the words are returned exactly
 * as they were stored in the buffer.
 */
unsigned long mt_random() {
    unsigned long * b = mt_buffer;
    int idx = mt_index;
    unsigned long s;
    int i;

    /* Explicit cast: idx is a signed int, the buffer size is a size_t. */
    if ((size_t) idx == MT_LEN * sizeof(unsigned long))
    {
        idx = 0;

        /* Words whose partner (i + MT_IA) is still inside the buffer. */
        for (i = 0; i < MT_IB; i++) {
            s = TWIST(b, i, i+1);
            b[i] = b[i + MT_IA] ^ (s >> 1) ^ MAGIC(s);
        }
        /* Remaining words: the partner index wraps around to i - MT_IB. */
        for (; i < MT_LEN-1; i++) {
            s = TWIST(b, i, i+1);
            b[i] = b[i - MT_IB] ^ (s >> 1) ^ MAGIC(s);
        }

        /* Last word pairs with word 0. */
        s = TWIST(b, MT_LEN-1, 0);
        b[MT_LEN-1] = b[MT_IA-1] ^ (s >> 1) ^ MAGIC(s);
    }
    mt_index = idx + (int) sizeof(unsigned long);

    /* Here there is a commented out block in MB's original program */

    /* Convert the byte offset back into an element index. */
    return b[(size_t) idx / sizeof(unsigned long)];
}

/*
 * Usage: <program> how_many_pairs
 *
 * Seeds the generator with mt_init() and writes how_many_pairs lines to
 * generated_numbers_mt.txt, each holding two values of
 * mt_random() / 4294967295 printed with 10 decimals.
 *
 * Returns EXIT_SUCCESS when every line was written and the file closed
 * cleanly, EXIT_FAILURE on bad arguments, on a file that cannot be opened,
 * or on an output error (for example when the file hits a size limit).
 */
int main (int argc, char *argv[]){

    char * endptr;
    long long requested;
    unsigned long long how_many_pairs;
    unsigned long long i;
    FILE * file;
    int status = EXIT_SUCCESS;

    if (argc < 2){
        fprintf (stderr, "usage: %s how_many_pairs\n", argc > 0 ? argv[0] : "mt");
        return EXIT_FAILURE;
    }

    /* Reject empty input, trailing garbage and negative counts; a negative
       value cast to unsigned would request an enormous number of pairs. */
    requested = strtoll (argv[1], &endptr, 10);
    if (endptr == argv[1] || *endptr != '\0' || requested < 0){
        fprintf (stderr, "invalid number of pairs: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    how_many_pairs = (unsigned long long) requested;

    file = fopen ("generated_numbers_mt.txt", "w");
    if (file == NULL){
        perror ("generated_numbers_mt.txt");
        return EXIT_FAILURE;
    }

    mt_init ();

    for (i = 0; i < how_many_pairs; i++){
        /* Stop at the first failed write instead of looping on silently. */
        if (fprintf (file, "%.10lf ", ((double) mt_random () / (double) 4294967295)) < 0
            || fprintf (file, "%.10lf\n", ((double) mt_random () / (double) 4294967295)) < 0){
            perror ("generated_numbers_mt.txt");
            fprintf (stderr, "stopped after %llu complete pairs\n", i);
            status = EXIT_FAILURE;
            break;
        }
    }

    /* fclose flushes buffered output, so its result must be checked too. */
    if (fclose (file) != 0){
        perror ("generated_numbers_mt.txt");
        status = EXIT_FAILURE;
    }

    return status;
}

Again, it loops 800.000.000 times, but it only stores 165191050 numbers.

$ ./devrandom 400000000
$ nl generated_numbers_devrandom.txt | tail # Here I'm just asking the shell to number the lines of the text file and to print out the 10 last ones.
82595516    0.8182168589 0.0370640513
82595517    0.1133005517 0.8237414290
82595518    0.9035788113 0.6030153367
82595519    0.9192735264 0.0945496135
82595520    0.0542484536 0.7224835437
82595521    0.1827865853 0.9254508596
82595522    0.0249044443 0.1234162976
82595523    0.0371284033 0.8898798078
82595524    0.5977596357 0.9672102989
82595525    0.5523654688 0.29032228

What is going on here?

Thanks in advance.

like image 513
Daniel Muñoz Parsapoormoghadam Avatar asked Dec 14 '13 13:12

Daniel Muñoz Parsapoormoghadam


2 Answers

Each line is 26 characters long, 82595525 lines x 26 = 2147483650 bytes

If you look more closely at the created file, I'm quite sure you will find that the last line is truncated and the file size is precisely 2147483647 bytes, i.e. 2^31-1.

The reason why you can't write a larger file is either a file system limitation or, more likely, the fact that you compiled a (non-large-file-aware) 32-bit binary, with which a file can't be larger than 2147483647 bytes, as that is the largest value a signed 32-bit file offset can hold.

If that is the case and if your OS is 64 bit, the simplest fix is to set the proper compiler flags to build a 64 bit binary which won't have this limitation.

Otherwise, have a look at abasterfield's workaround.

like image 86
jlliagre Avatar answered Nov 14 '22 23:11

jlliagre


Compile with CFLAGS -D_FILE_OFFSET_BITS=64 or put

#define _FILE_OFFSET_BITS 64

in your code before you include any libc headers

like image 28
abasterfield Avatar answered Nov 14 '22 21:11

abasterfield