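Given a file containing two columns of integers, I want to get rid of the gaps between the integer values. By gap I mean the following: take two integers A and B that appear in the file, such that there is no C in the file with A < C < B; if B is larger than A+1, the values between A and B are missing and form a gap. For example, I want to go from this: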
1 2
1 3
2 5
6 9
3 5
7 9
11 6
7 11
to this:
1 2
1 3
2 4
5 7
3 4
6 7
8 5
6 8
In the first two columns, the present integers are {1,2,3,5,6,7,9,11}. The missing values are {4,8,10}. The goal is to decrease every integer by the number of missing values that are smaller than it. So 5, 6 and 7 are decreased by 1, 9 is decreased by 2, and 11 is decreased by 3. So the values {1,2,3,5,6,7,9,11} are replaced by {1,2,3,4,5,6,7,8}. Does anyone know how to do it efficiently, using a Linux command, a bash script or an awk command? Thank you!
Edit: I tried to do it, but I didn't find a way to do it in a shell script alone; I had to write a C program which executes shell commands. The first part just sorts the file; the second part does what I described in the question.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_INTS 100000000
/*
 * Renumber the integers of the two-column file `path` in place so that no
 * value is skipped: every value is decreased by the number of missing
 * integers that are smaller than it.
 *
 * Part 1 runs a shell script that
 *   - keeps the first four lines of the file as they are,
 *   - sorts the two values of every remaining line, sorts the lines
 *     numerically and removes duplicates, then writes the result back
 *     to `path`,
 *   - writes the missing integers (the gaps between consecutive values that
 *     do occur), space separated, to the file "tmpfile".
 *
 * Part 2 loads the missing integers, walks the sorted file and, for each
 * pair (v1, v2), counts d1/d2 = number of missing integers below v1/v2 and
 * rewrites the line through `sed -i`.
 *
 * path: file to process; it is overwritten.
 *
 * Nothing is returned; failures are reported on stderr and the function
 * returns early after releasing what it acquired.
 *
 * NOTE(review): `path` is pasted unquoted into shell command lines, so a
 * path containing spaces or shell metacharacters will not work as intended.
 * NOTE(review): the sed pattern is not anchored, so "5 6" also matches
 * inside a line such as "15 6"; one sed process is also spawned per
 * rewritten line. Writing the corrected lines to a new file would avoid both.
 */
void process_file(char *path){
    //FIRST PART
    char *outfpath="tmpfile";
    /* Fixed part of the script plus three copies of path and one of outfpath. */
    size_t command_size=512+3*strlen(path)+strlen(outfpath);
    char *command=calloc(command_size,sizeof(char));
    if(command==NULL){
        perror("calloc");
        return;
    }
    /* '%' and 's' are passed through %c%c so that awk receives a literal "%s". */
    int written=snprintf(command,command_size,
        "#!/bin/bash \n"
        "var1=$( cat %s | head -n 4 && ( cat %s | tail -n +5 | awk '{split( $0, a, \" \" ); asort( a ); for( i = 1; i <= length(a); i++ ) printf( \"%c%c \", a[i] ); printf( \"\\n\" ); }' | sort -n -k1,1 -k2 | uniq) )\n"
        "var2=$( ( (echo \"$var1\" | tail -n +5 | cut -f 1 -d\" \") && (echo \"$var1\" | tail -n +5 | cut -f 2 -d\" \" ) ) | sort -n -k1,1 | uniq | awk '{for(i=p+1; i<$1; i++) print i} {p=$1}' )\n"
        "echo \"$var1\" > %s\n"
        "echo \"$var2\"| tr \"\\n\" \" \" > %s",
        path,path,'%','s',path,outfpath);
    if(written<0||(size_t)written>=command_size){
        fprintf(stderr,"Command line too long for %s\n",path);
        free(command);
        return;
    }
    if(system(command)==-1){
        fprintf(stderr,"Erreur à l'exécution de la commande \n%s\n",command);
    }
    free(command);
    command=NULL;
    //the first part only sorts the file and puts in outpath the list of the missing integers

    //SECOND PART
    long unsigned start=0,end=0,val,index=0;
    long unsigned *intvals=calloc(MAX_INTS,sizeof(long unsigned));
    if(intvals==NULL){
        perror("calloc");
        return;
    }
    FILE *f=fopen(outfpath,"r");
    if(f==NULL){
        perror(outfpath);
        free(intvals);
        return;
    }
    //reads the file and loads the missing ints to the array intvals
    //(`end` is the index of the last loaded value; never read past MAX_INTS)
    while(index<MAX_INTS && fscanf(f,"%lu ",&val)==1){
        end=index;
        intvals[index]=val;
        index++;
    }
    fclose(f);
    if(index==0){
        /* No missing value: the file needs no renumbering. */
        free(intvals);
        return;
    }
    /* Give back the unused tail; on failure the original block stays valid. */
    long unsigned *shrunk=realloc(intvals,index*sizeof(long unsigned));
    if(shrunk!=NULL){
        intvals=shrunk;
    }

    /* The stream is only read here; all edits go through sed. */
    f=fopen(path,"r");
    if(f==NULL){
        perror(path);
        free(intvals);
        return;
    }
    /* "sed -i 's/A B/C D/' " plus four 20-digit numbers fits in 128 bytes. */
    size_t sed_size=strlen(path)+128;
    char *line=calloc(1000,sizeof(char));
    command=calloc(sed_size,sizeof(char));
    if(line==NULL||command==NULL){
        perror("calloc");
        free(line);
        free(command);
        free(intvals);
        fclose(f);
        return;
    }
    long unsigned v1,v2,
                  d1=0,d2=0,
                  prec=-1,start_l=0;
    //read a file containing two columns of ints
    //for each pair v1 v2, count d1 d2,
    //such as d1 is the number of missing values smaller than v1, d2 the number of missing values smaller than v2
    //and overwrite the line in the file using sed with the values v1-d1 and v2-d2

    /* Skip the leading '#' lines; remember whether a data line was found. */
    int have_line=0;
    while(fgets(line,1000,f)!=NULL){
        if(line[0]!='#'){
            have_line=1;
            break;
        }
    }
    for(;have_line;have_line=(fgets(line,1000,f)!=NULL)){
        char *tok1=strtok(line," \t");
        char *tok2=(tok1!=NULL)?strtok(NULL," \t"):NULL;
        if(tok1==NULL||tok2==NULL){
            /* Blank or single-column line: nothing to renumber. */
            continue;
        }
        v1=strtoul(tok1,NULL,10);
        v2=strtoul(tok2,NULL,10);
        if(prec!=v1) {
            /* New first-column value: restart the v2 scan from the v1 state. */
            prec=v1;
            d2=d1;
            start_l=start;
        }
        /* Advance past the missing values below v1. */
        for(index=start;index<=end;index++){
            if(intvals[index]<v1){
                d1++;
                start++;
            }else{
                start=d1;
                break;
            }
        }
        /* Advance past the missing values below v2. */
        for(index=start_l;index<=end;index++){
            if(intvals[index]<v2){
                d2++;
                start_l++;
            }else{
                break;
            }
        }
        /* Rewrite whenever any offset applies, not only when a counter has
           just advanced: later lines with the same offsets need it too. */
        if(d1!=0||d2!=0){
            written=snprintf(command,sed_size,"sed -i 's/%lu %lu/%lu %lu/' %s",v1,v2,v1-d1,v2-d2,path);
            if(written<0||(size_t)written>=sed_size){
                fprintf(stderr,"Command line too long for %s\n",path);
                continue;
            }
            if(system(command)==-1){
                fprintf(stderr,"Erreur à l'exécution de la commande \n%s\n",command);
            }
        }
    }
    fclose(f);
    free(command);
    free(line);
    free(intvals);
}
/*
 * Entry point: expects the path of the two-column file as first argument
 * and renumbers it in place with process_file().
 * Returns 0 after processing, EXIT_FAILURE when no path was given.
 */
int main(int argc,char* argv[]){
    if(argc<2){
        /* Without this guard argv[1] is NULL and process_file would crash. */
        fprintf(stderr,"Usage: %s <file>\n",argc>0?argv[0]:"program");
        return EXIT_FAILURE;
    }
    process_file(argv[1]);
    return 0;
}
This might do it:
awk '
# Pass 1 (first read of the file): note every value and track the largest.
NR == FNR {
    for (k = 1; k <= NF; k++) {
        seen[$k]
        if (top < $k) top = $k
    }
    next
}
# Pass 2, first record: walk 1..top and give each noted value its rank.
FNR == 1 {
    for (v = 1; v <= top; v++)
        if (v in seen) seen[v] = ++rank
}
# Pass 2, every record: swap each field for its rank, then print the line.
{
    for (k = 1; k <= NF; k++) $k = seen[$k]
    print
}' file file
If `file` contains the following input:
1 2
1 3
2 5
6 9
3 5
7 9
11 6
7 11
The above command will return:
1 2
1 3
2 4
5 7
3 4
6 7
8 5
6 8
The idea of this method is to keep track of an array `a` which is indexed by the old value and returns the new value: `a[old]=new`. We scan the file twice. During the first scan we store all values that occur as indices of `a`. When we start reading the file for the second time, we first work out what the new values are going to be: we walk from 1 up to the largest value and number the ones that occur in order. When that is done, we just update all the fields with the new values and print the result.
The above can also be done by reading the file a single time, you just need to buffer a bit:
awk '
# Buffer every line while noting each value and tracking the largest one.
{
    line[FNR] = $0
    for (k = 1; k <= NF; k++) {
        seen[$k]
        if (top < $k) top = $k
    }
}
END {
    # Walk 1..top and give each noted value its rank.
    for (v = 1; v <= top; v++)
        if (v in seen) seen[v] = ++rank
    # Replay the buffered lines with every field swapped for its rank.
    for (r = 1; r <= FNR; r++) {
        $0 = line[r]
        for (k = 1; k <= NF; k++) $k = seen[$k]
        print
    }
}' file
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With