for the past weeks i've been working on creating a program that should work like the 'du' command on linux with these flags:
du -l -s -B512
I've managed to get the general of the program to work on smaller files but when it get's tested it for some reason fails a test on a large directory using 1 thread and I cannot figure it out.
Using 1 argument /usr/lib, I get this output:
7436496 /usr/lib
whereas I expected this:
7437280 /usr/lib
I have tried debugging and recreating this error by running tests on my machine but I always get the same results as the 'du' command as seen here:
$ ./mdu -j1
704664 lib
vs:
$ du -s -1
704664 lib
Code in mdu.c:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <stdbool.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <errno.h>
#include <fcntl.h>
#include <dirent.h>
#include "queue.h"
#include <sys/types.h>
#include <sys/statvfs.h>
void parse_options(int argc, char** argv, int* jflag, int* num_threads, char*** files, int* num_files);
int list_files(const char* path, size_t* totalSize);
void* thread_function(void* arg);
int process_directory(const char* path, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue);
int process_directory_queue(const char* path, size_t* totalSize);
int accumulate_file_size(const char* path, size_t* totalSize);
int handle_directory_entry(const char* path, const struct dirent* entry, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue);
int add_directory_size(const char* path, size_t* totalSize, pthread_mutex_t* sizeMutex);
int handle_subdirectory(const char* nextPath, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue);
int handle_regular_file(const struct stat* st, size_t* totalSize, pthread_mutex_t* sizeMutex);
typedef struct ThreadData {
Queue* workQueue;
pthread_mutex_t* sizeMutex;
size_t* totalSize;
} ThreadData;
int main(int argc, char** argv)
{
int jflag = 0;
int num_threads = 1;
char** files;
int num_files;
ThreadData data;
Queue workQueue;
int error_occurred = 0;
parse_options(argc, argv, &jflag, &num_threads, &files, &num_files);
pthread_mutex_t sizeMutex = PTHREAD_MUTEX_INITIALIZER;
data.sizeMutex = &sizeMutex;
data.workQueue = &workQueue;
initialize_queue(&workQueue);
for (int i = 0; i < num_files; i++) {
size_t totalsize = 0;
data.totalSize = &totalsize;
enqueue_item(&workQueue, files[i]);
if (jflag == 0 || num_threads == 1) {
error_occurred |= list_files(files[i], &totalsize);
} else {
pthread_t threads[num_threads];
int ret;
for (int j = 0; j < num_threads; ++j) {
ret = pthread_create(&threads[j], NULL, thread_function, &data);
if (ret != 0) {
fprintf(stderr, "Error: Cannot create thread %d\n", j);
exit(EXIT_FAILURE);
}
}
for (int j = 0; j < num_threads; ++j) {
ret = pthread_join(threads[j], NULL);
if (ret != 0) {
fprintf(stderr, "Error: Cannot join thread %d\n", j);
exit(EXIT_FAILURE);
}
}
}
printf("%ld\t%s\n", totalsize/2, files[i]);
}
cleanup_queue(&workQueue);
pthread_mutex_destroy(&sizeMutex);
if (error_occurred) {
return 1;
}
return 0;
}
int list_files(const char* path, size_t* totalSize) {
struct stat st;
if (lstat(path, &st) == 0) {
if (S_ISDIR(st.st_mode)) {
return process_directory_queue(path, totalSize);
} else if (S_ISREG(st.st_mode)) {
return accumulate_file_size(path, totalSize);
} else {
fprintf(stderr, "Unsupported file type: %s\n", path);
return 1;
}
} else {
perror("lstat");
return 1;
}
}
int process_directory_queue(const char* path, size_t* totalSize) {
pthread_mutex_t fakeMutex = PTHREAD_MUTEX_INITIALIZER;
Queue fakeQueue;
initialize_queue(&fakeQueue);
enqueue_item(&fakeQueue, path);
char nextPath[4096];
while (true) {
bool dequeued = dequeue_item(&fakeQueue, nextPath);
if (!dequeued) {
break;
}
int error = process_directory(nextPath, totalSize, &fakeMutex, &fakeQueue);
if (error) {
cleanup_queue(&fakeQueue);
pthread_mutex_destroy(&fakeMutex);
return error;
}
}
cleanup_queue(&fakeQueue);
pthread_mutex_destroy(&fakeMutex);
return 0;
}
int accumulate_file_size(const char* path, size_t* totalSize) {
struct stat st;
if (lstat(path, &st) == 0) {
unsigned long file_blocks = st.st_blocks;
*totalSize += file_blocks;
return 0;
} else {
perror("lstat file");
return 1;
}
}
void* thread_function(void* arg)
{
ThreadData* data = (ThreadData*)arg;
Queue* workQueue = data->workQueue;
pthread_mutex_t* sizeMutex = data->sizeMutex;
size_t* totalSize = data->totalSize;
char path[4096];
while (true) {
bool success = dequeue_item(workQueue, path);
if (!success) {
break;
}
process_directory(path, totalSize, sizeMutex, workQueue);
}
return NULL;
}
int process_directory(const char* path, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue) {
int error_occurred = 0;
DIR* dp = opendir(path);
if (dp == NULL) {
perror("Unable to open directory");
return 1;
}
struct dirent* entry;
while ((entry = readdir(dp)) != NULL) {
if (handle_directory_entry(path, entry, totalSize, sizeMutex, workQueue) != 0) {
error_occurred = 1;
}
}
if (add_directory_size(path, totalSize, sizeMutex) != 0) {
error_occurred = 1;
}
closedir(dp);
return error_occurred;
}
int handle_directory_entry(const char* path, const struct dirent* entry, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue) {
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
return 0;
}
char nextPath[4096];
snprintf(nextPath, sizeof(nextPath), "%s/%s", path, entry->d_name);
struct stat st;
if (lstat(nextPath, &st) == 0) {
if (S_ISDIR(st.st_mode)) {
return handle_subdirectory(nextPath, totalSize, sizeMutex, workQueue);
} else if (S_ISREG(st.st_mode)) {
return handle_regular_file(&st, totalSize, sizeMutex);
}
} else {
if (errno == EACCES) {
fprintf(stderr, "du: cannot read directory '%s': Permission denied\n", nextPath);
return 1;
} else {
perror("lstat");
return 1;
}
}
return 0;
}
int handle_subdirectory(const char* nextPath, size_t* totalSize, pthread_mutex_t* sizeMutex, Queue* workQueue) {
DIR* sub_dp = opendir(nextPath);
if (sub_dp == NULL) {
if (errno == EACCES) {
fprintf(stderr, "du: cannot read directory '%s': Permission denied\n", nextPath);
add_directory_size(nextPath, totalSize, sizeMutex); // Add the size even if access is denied.
return 1; // Error occurred
} else {
perror("Unable to open sub-directory");
return 1; // Error occurred
}
} else {
closedir(sub_dp);
enqueue_item(workQueue, nextPath);
return 0; // Success
}
}
int handle_regular_file(const struct stat* st, size_t* totalSize, pthread_mutex_t* sizeMutex) {
unsigned long file_blocks = st->st_blocks;
pthread_mutex_lock(sizeMutex);
*totalSize += file_blocks;
pthread_mutex_unlock(sizeMutex);
return 0; // Success
}
int add_directory_size(const char* path, size_t* totalSize, pthread_mutex_t* sizeMutex) {
struct stat st_dir;
if (lstat(path, &st_dir) == 0) {
unsigned long dir_blocks = st_dir.st_blocks;
pthread_mutex_lock(sizeMutex);
*totalSize += dir_blocks;
pthread_mutex_unlock(sizeMutex);
return 0; // Success
} else {
perror("lstat directory");
return 1; // Error occurred
}
}
void parse_options(int argc, char** argv, int* jflag, int* num_threads, char*** files, int* num_files) {
int c;
while ((c = getopt(argc, argv, "j:")) != -1) {
switch (c) {
case 'j':
*jflag = 1;
*num_threads = atoi(optarg);
if (*num_threads <= 0) {
fprintf(stderr, "-j has to be positive.\n");
exit(EXIT_FAILURE);
}
break;
case '?':
if (isprint(optopt)) {
fprintf(stderr, "Unknown option '-%c'.\n", optopt);
}
exit(EXIT_FAILURE);
default:
exit(EXIT_FAILURE);
}
}
*num_files = argc - optind;
*files = argv + optind;
}
This is the queue.c:
#include "queue.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void initialize_queue(Queue* queue)
{
queue->head = NULL;
queue->tail = NULL;
pthread_mutex_init(&(queue->mutex), NULL);
pthread_cond_init(&(queue->cond_var), NULL);
}
void enqueue_item(Queue* queue, const char* path)
{
Item* item = (Item*) malloc(sizeof(Item));
if (item == NULL) {
perror("Malloc Error: ");
exit(EXIT_FAILURE);
}
strncpy(item->path, path, sizeof(item->path) - 1);
item->next = NULL;
pthread_mutex_lock(&(queue->mutex));
if (queue->tail == NULL) {
queue->head = item;
queue->tail = item;
} else {
queue->tail->next = item;
queue->tail = item;
}
pthread_cond_signal(&(queue->cond_var));
pthread_mutex_unlock(&(queue->mutex));
}
bool dequeue_item(Queue* queue, char* path) {
pthread_mutex_lock(&(queue->mutex));
if (queue->head == NULL) {
pthread_mutex_unlock(&(queue->mutex));
return false;
}
Item* item = queue->head;
queue->head = item->next;
if (queue->head == NULL) {
queue->tail = NULL;
}
strncpy(path, item->path, 4096);
free(item);
pthread_mutex_unlock(&(queue->mutex));
return true;
}
void cleanup_queue(Queue* queue) {
pthread_mutex_destroy(&(queue->mutex));
pthread_cond_destroy(&(queue->cond_var));
while (queue->head != NULL) {
Item* item = queue->head;
queue->head = item->next;
free(item);
}
}
Any help is greatly appreciated thanks.
EDIT: queue.h:
#ifndef QUEUE_H
#define QUEUE_H
#include <pthread.h>
#include <stdbool.h>
typedef struct Item {
char path[4096];
struct Item* next;
} Item;
typedef struct Queue {
Item* head;
Item* tail;
pthread_mutex_t mutex;
pthread_cond_t cond_var;
} Queue;
void initialize_queue(Queue* queue);
void enqueue_item(Queue* queue, const char* path);
bool dequeue_item(Queue* queue, char* path);
void cleanup_queue(Queue* queue);
#endif // QUEUE_H
EDIT 2:
This is the feedback I have gotten:
busy-waiting on in the while true loop, a condition variable is required.
Could this cause this issue?
EDIT 3
After suspecting the symbolic links were the issue these functions were changed/implemented to handle them:
int handle_symbolic_link(const char* path, ThreadData* data) {
struct stat st;
if (lstat(path, &st) == 0) {
pthread_mutex_lock(data->sizeMutex);
*(data->totalSize) += st.st_size;
pthread_mutex_unlock(data->sizeMutex);
return 0;
} else {
perror("lstat symbolic link");
return 1;
}
}
Change in handle_directory_entry
int handle_directory_entry(const char* path, const struct dirent* entry, ThreadData* data) {
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
return 0;
}
char nextPath[4096];
snprintf(nextPath, sizeof(nextPath), "%s/%s", path, entry->d_name);
struct stat st;
if (lstat(nextPath, &st) == 0) {
if (S_ISDIR(st.st_mode)) {
return handle_subdirectory(nextPath, data);
}
else if (S_ISREG(st.st_mode)) {
return handle_regular_file(&st, data);
}
else if (S_ISLNK(st.st_mode)) {
return handle_symbolic_link(nextPath,data);
}
}
else {
if (errno == EACCES) {
fprintf(stderr, "du: cannot read directory '%s': Permission denied\n", nextPath);
return 1;
}
else {
perror("lstat");
return 1;
}
}
return 0;
}
After testing these are the new reports:
7475809 /usr/lib
whereas I expected this:
7437280 /usr/lib
I am also failing this tests the explicitly checks for symbolic link as seen here
TEST - Symbolic link [FAILED]
Check that the program does not follow symbolic links: ./mdu a
mkdir a
fallocate -l 65537 a/a
ln -s ../b a/b
mkdir b
fallocate -l 94847 b/a
fallocate -l 6701 b/b
Program output:
156 a
Expected output:
152 a
This leads me to believe that either I am not supposed to handle symbolic links or the issue lies within hard links.
/usr/lib contains many symbolic links, such as:
libbfd-2.25-system.so
libbind9.so.90@
libbind9.so.90.0.9
libcdt.so.5@
libcdt.so.5.0.0
libcgraph.so.6@
libcgraph.so.6.0.0
libdiscover.so.2@
libdiscover.so.2.0.1
libdns.so.100@
libdns.so.100.2.2
for which you should accumulate the size as they are not necessarily stored inside the inode.
Another more difficult problem is the du command only counts files and directories once even in presence of multiple hard links pointing to the same inode. For example, on my macOS laptop, the subdirectory /usr/lib/usd/usd/usdShaders/resources/shaders has this contents:
$ ls -lis /usr/lib/usd/usd/usdShaders/resources/shaders
total 48
1152921500312436903 16 -rw-r--r-- 1 root wheel 14922 May 13 00:29 previewSurface.glslfx
1152921500312277175 8 -rw-r--r-- 6 root wheel 1223 May 13 00:29 primvarReader.glslfx
1152921500312436906 8 -rw-r--r-- 1 root wheel 13867 May 13 00:29 shaderDefs.usda
1152921500312277250 8 -rw-r--r-- 2 root wheel 1223 May 13 00:29 transform2d.glslfx
1152921500312277175 8 -rw-r--r-- 6 root wheel 1223 May 13 00:29 uvTexture.glslfx
So primvarReader.glslfx and uvTexture.glslfx point to the same inode (which has 4 other paths pointing to it), and du only counts it once:
$ du -a /usr/lib/usd/usd/usdShaders/resources/shaders
16 /usr/lib/usd/usd/usdShaders/resources/shaders/previewSurface.glslfx
8 /usr/lib/usd/usd/usdShaders/resources/shaders/primvarReader.glslfx
8 /usr/lib/usd/usd/usdShaders/resources/shaders/shaderDefs.usda
8 /usr/lib/usd/usd/usdShaders/resources/shaders/transform2d.glslfx
40 /usr/lib/usd/usd/usdShaders/resources/shaders
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With