Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Batch metadata requests for files

It basically boils down to: if I have 4000 files in a directory, the File.isDirectory() function takes 1ms to execute, so the directory takes 4s to compute (too slow [ 1 ]).

I don't have the most complete knowledge of the filesystem, but I think that isDirectory() can be batched for all the elements in the directory (reading a chunk of data, and then separating out each individual file's metadata). C/C++ code is acceptable (it can be run with the JNI) but should be left as a last resort.

I have found FileVisitor, but it doesn't seem that it is the best solution to my problem, as I don't have to visit the entire file tree. I also found BasicFileAttributeView but it seems it has the same problem. This is a related question but there aren't answers that provide a significant solution.

[ 1 ]: Because it is not the only thing I do it ends up being like 17s.

Edit: Code:

/**
 * Starts a coroutine that preloads the child folders of the folder named by [unit].
 *
 * `unit.first` is used as the path of the folder to scan and `unit.second` is
 * forwarded unchanged to every [ProcessUnit] queued for the folder's grandchildren.
 * If at least one entry was queued, [work] is invoked at the end to process the queue.
 *
 * Parts of the original body are elided in this excerpt (marked with dots).
 */
internal fun workFrom(unit: ProcessUnit<D>) {
    launch {
        // Set to true once any subfolder contributes at least one queued entry.
        var somethingAddedToPreload = false
        val file = File(unit.first)

        ....

        //Load children folders
        // The filter calls isDirectory once per entry; listFiles() may return null,
        // in which case nothing is iterated.
        file.listFiles(FileFilter {
            it.isDirectory
        })?.forEach {
            // The preload map is only read/modified while holding its mutex.
            getPreloadMapMutex().withLock {
                // Skip subfolders that already have an entry in the preload map.
                if (getPreloadMap()[it.path] == null) {
                    // list() returns null on failure; treat that as an empty folder.
                    val subfiles = it.list() ?: arrayOf()
                    for (filename in subfiles) {
                        // Queue a deferred load for each child, keyed by the subfolder path.
                        addToProcess(it.path, ProcessUnit(it.path + DIVIDER + filename, unit.second))
                    }

                    // Register the subfolder with its child count, trimming the map when it grows
                    // past PRELOADED_MAP_MAXIMUM, and record the path in the delete queue.
                    getPreloadMap()[it.path] = PreloadedFolder(subfiles.size)
                    if (getPreloadMap().size > PRELOADED_MAP_MAXIMUM) cleanOldEntries()
                    getDeleteQueue().add(it.path)

                    somethingAddedToPreload = somethingAddedToPreload || subfiles.isNotEmpty()
                }
            }
        }

        ...

        // Only run the queue when this pass actually queued something.
        if(somethingAddedToPreload) {
            work()
        }
    }
}

/**
 * Queues a deferred call to `load(path, unit)` on [preloadList].
 *
 * Nothing is loaded here; the stored function is invoked later by [work].
 */
private fun addToProcess(path: String, unit: ProcessUnit<D>) {
    val deferredLoad = fun(): Pair<String, FetcherFunction<D>> = load(path, unit)
    preloadList.add(deferredLoad)
}

/**
 * Runs every deferred load queued in [preloadList] and then empties the queue.
 *
 * While holding [preloadListMutex], one coroutine is launched per queued loader.
 * Each coroutine invokes its loader and appends the result to the preload-map
 * entry for the returned path.
 *
 * @throws IllegalStateException (inside the launched coroutine) when the preload
 *   map no longer has an entry for the loaded path.
 */
private suspend fun work() {
    preloadListMutex.withLock {
        for (loader in preloadList) {
            launch {
                val (path, data) = loader()

                if (FilePreloader.DEBUG) {
                    Log.d("FilePreloader.Processor", "Loading from $path: $data")
                }

                // The folder entry must still exist; it is created before the loader is queued.
                val folder = checkNotNull(getPreloadMap()[path]) {
                    "A list has been deleted before elements were added. We are VERY out of memory!"
                }
                folder.add(data)
            }
        }
        preloadList.clear()
    }
}

PS: I will remove the coroutines in work before implementing an optimization, complete code is here.

like image 904
EmmanuelMess Avatar asked Mar 25 '18 19:03

EmmanuelMess


2 Answers

You could run ls -F and check whether each entry is a directory by looking at the last character of its output line: directory names end with /. E.g.

// Pass the command as an argument array: Runtime.exec(String) splits the string on
// whitespace, so a directory path containing spaces would be broken into several arguments.
val cmd = arrayOf("ls", "-F", myFile.absolutePath)
val process = Runtime.getRuntime().exec(cmd)
val files = process.inputStream
        .bufferedReader()
        .use(BufferedReader::readText)
        .lines()
        // The output ends with a newline, which would otherwise yield an empty last entry.
        .filter { it.isNotEmpty() }
// The output has been read to EOF; wait so the child process is reaped.
process.waitFor()

for (fileName in files) {
    // `ls -F` appends an indicator character to each name; "/" marks a directory.
    val isDir = fileName.endsWith("/")
}

I ran a quick test on an emulator: with 4000 files and 4000 directories, the whole thing takes ~150ms.

like image 152
lelloman Avatar answered Nov 03 '22 19:11

lelloman


Years ago, I had to write a JNI interface to opendir()/readdir()/closedir()/rewinddir() to address a similar performance issue. It's a bit of a hack, as it uses a jlong to hold a DIR * pointer from opendir() and pass it to subsequent readdir() and closedir() calls, but it was probably several orders of magnitude faster than Java's listFiles() on large directories.

It does require a JNI library, but you may find it useful:

/*
 * Class:     path_to_jni_ReadDir
 * Method:    opendir
 * Signature: (Ljava/lang/String;)J
 *
 * Opens the directory named by jdirname and returns the DIR * from ::opendir()
 * packed into a jlong. Returns 0 when jdirname is null, when its characters
 * cannot be obtained, when the name is empty, or when ::opendir() itself fails.
 */
JNIEXPORT jlong JNICALL Java_path_to_jni_ReadDir_opendir
  (JNIEnv *env, jclass cl, jstring jdirname )
{
    if ( jdirname == NULL )
    {
        return( ( jlong ) NULL );
    }

    jboolean isCopy;
    const char *path = env->GetStringUTFChars( jdirname, &isCopy );
    if ( path == NULL )
    {
        return( ( jlong ) NULL );
    }

    /* An empty name is never opened; the handle stays 0 in that case. */
    jlong handle = ( jlong ) NULL;
    if ( path[ 0 ] != '\0' )
    {
        handle = ( jlong ) ::opendir( path );
    }

    /* Single release point for the UTF chars obtained above. */
    env->ReleaseStringUTFChars( jdirname, path );

    return( handle );
}

/*
 * Class:     path_to_jni_ReadDir
 * Method:    readdir
 * Signature: (J)Ljava/lang/String;
 *
 * Reads the next entry from the directory stream whose DIR * is carried in
 * dirp (as returned by the opendir method) and returns its name as a Java
 * string. Returns NULL when dirp is 0, when ::readdir_r() reports an error,
 * or when the end of the directory has been reached.
 */
JNIEXPORT jstring JNICALL Java_path_to_jni_ReadDir_readdir
  (JNIEnv *env, jclass cl, jlong dirp )
{
    /* A union gives the scratch space the alignment of struct dirent;
       a bare char array cast to struct dirent * has no such guarantee. */
    union
    {
        struct dirent entry;
        char          bytes[ 8192 ];
    } storage;

    struct dirent *dentp = NULL;
    int rc;

    /* opendir reports failure as 0; never hand that to readdir_r. */
    if ( 0 == dirp )
    {
        return( NULL );
    }

    /* readdir_r returns 0 on success and sets dentp to NULL at end of directory. */
    rc = ::readdir_r( ( DIR * ) dirp, &storage.entry, &dentp );
    if ( ( 0 != rc ) || ( NULL == dentp ) )
    {
        return( NULL );
    }

    /* The JNI method is NewStringUTF (capital N). */
    return( env->NewStringUTF( dentp->d_name ) );
}

/*
 * Class:     path_to_jni_ReadDir
 * Method:    closedir
 * Signature: (J)I
 *
 * Closes the directory stream whose DIR * is carried in dirp and returns
 * the result of ::closedir() unchanged.
 */
JNIEXPORT jint JNICALL Java_path_to_jni_ReadDir_closedir
  (JNIEnv *env, jclass cl, jlong dirp )
{
    DIR *stream = ( DIR * ) dirp;

    return( ::closedir( stream ) );
}

/*
 * Class:     path_to_jni_ReadDir
 * Method:    rewinddir
 * Signature: (J)V
 *
 * Calls ::rewinddir() on the directory stream whose DIR * is carried in dirp.
 */
JNIEXPORT void JNICALL Java_path_to_jni_ReadDir_rewinddir
  (JNIEnv *env, jclass cl, jlong dirp )
{
    DIR *stream = ( DIR * ) dirp;

    ::rewinddir( stream );
}

I've removed customer-identifying information from the code, so it's not exactly as delivered and may have some typos.

Given the Android dirent structure is

/* Directory entry as filled in by readdir_r(). Field notes other than d_type
   and d_name follow the usual dirent conventions - confirm against the
   platform header. */
struct dirent {
  uint64_t         d_ino;       /* presumably the inode number */
  int64_t          d_off;       /* presumably an offset within the directory stream */
  unsigned short   d_reclen;    /* presumably the length of this record */
  unsigned char    d_type;      /* entry type: one of the DT_* values */
  char             d_name[256]; /* entry name, passed to NewStringUTF by the JNI readdir method */
};

You can modify the JNI readdir method to add filters based on the d_type field, which contains one of the following values:

/* Possible values of dirent::d_type. Descriptions follow the conventional
   meaning of each DT_* name - confirm against the platform header. */
#define  DT_UNKNOWN     0   /* type could not be determined */
#define  DT_FIFO        1   /* named pipe (FIFO) */
#define  DT_CHR         2   /* character device */
#define  DT_DIR         4   /* directory */
#define  DT_BLK         6   /* block device */
#define  DT_REG         8   /* regular file */
#define  DT_LNK         10  /* symbolic link */
#define  DT_SOCK        12  /* socket */
#define  DT_WHT         14  /* whiteout entry */

For example, if you're looking for directories, you can add a loop to keep calling ::readdir_r() until it either returns NULL or the d_type field is DT_DIR:

    /* Keep reading entries until one is a directory. rc, dirp, dentbuffer and
       dentp are the locals/parameter of the JNI readdir method shown earlier. */
    for ( ;; )
    {
        rc = ::readdir_r( ( DIR * ) dirp, dentbuffer, &dentp );
        /* Stop with NULL on a read error or once no entry is left. */
        if ( ( SUCCESS != rc ) || ( NULL == dentp ) )
        {
            return( NULL );
        }

        /* Found a directory entry: leave the loop with dentp pointing at it. */
        if ( dentp->d_type == DT_DIR )
        {
            break;
        }
    }
like image 44
Andrew Henle Avatar answered Nov 03 '22 17:11

Andrew Henle