Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to convert minizip wrapper to unicode?

Tags:

c++

c

zlib

I am trying to use the minizip wrapper from [http://www.zlib.net/] for zipping a folder. It works fine as long as the file names are in English. Has anyone tried to modify minizip to support Unicode?

The modified code is posted below. The problem is with this function: its second argument takes a const char* as input. When I do the conversion it loses data, and the file names are not the same.

eg: Chinese-統一碼.txt becomes Chinese-t+ƒS+Çtáü.txt inside zip.

// Call under discussion: the entry name (second argument) is passed as a narrow const char*.
err = zipOpenNewFileInZip3_64(  zf,outstr.c_str(),&zi,
                                        NULL,0,NULL,0,NULL /* comment*/,
                                        (opt_compress_level != 0) ? Z_DEFLATED : 0,
                                        opt_compress_level,0,
                                        /* -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, */
                                        -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY,
                                        password,crcFile, zip64);








/*
minizip.c
Version 1.1, February 14h, 2010
sample part of the MiniZip project - ( http://www.winimage.com/zLibDll/minizip.html )

Copyright (C) 1998-2010 Gilles Vollant (minizip) ( http://www.winimage.com/zLibDll/minizip.html )

Modifications of Unzip for Zip64
Copyright (C) 2007-2008 Even Rouault

Modifications for Zip64 support on both zip and unzip
Copyright (C) 2009-2010 Mathias Svensson ( http://result42.com )
*/


/* Ask the C library for 64-bit file offsets on platforms other than Windows
   and macOS. These macros only have an effect when they are defined before
   the first system header is included, which is why they sit at the top of
   the file. */
#if (!defined(_WIN32)) && (!defined(WIN32)) && (!defined(__APPLE__))
#ifndef __USE_FILE_OFFSET64
#define __USE_FILE_OFFSET64
#endif
#ifndef __USE_LARGEFILE64
#define __USE_LARGEFILE64
#endif
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
/* The feature-test macro is spelled _FILE_OFFSET_BITS (plural); the previous
   spelling _FILE_OFFSET_BIT is not a recognised macro and had no effect. */
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif
#endif

/* File-access wrappers: FOPEN_FUNC opens a file, FTELLO_FUNC reports the
   current position and FSEEKO_FUNC repositions the stream. The macros only
   select which C library function name is used on each platform. */
#ifdef __APPLE__
// In darwin and perhaps other BSD variants off_t is a 64 bit value, hence no need for specific 64 bit functions
#define FOPEN_FUNC(filename, mode) fopen(filename, mode)
#define FTELLO_FUNC(stream) ftello(stream)
#define FSEEKO_FUNC(stream, offset, origin) fseeko(stream, offset, origin)
#else
/* Every other platform uses the explicit 64-bit names.
   NOTE(review): on Windows these names are presumably mapped to CRT
   equivalents by the minizip I/O headers - confirm. */
#define FOPEN_FUNC(filename, mode) fopen64(filename, mode)
#define FTELLO_FUNC(stream) ftello64(stream)
#define FSEEKO_FUNC(stream, offset, origin) fseeko64(stream, offset, origin)
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <errno.h>
#include <fcntl.h>
#ifdef _WIN32
# include <direct.h>
# include <io.h>
#define GetCurrentDir _getcwd
#else
# include <unistd.h>
# include <utime.h>
# include <sys/types.h>
# include <sys/stat.h>
#endif

#include "zip.h"
#include "Shlwapi.h"

#ifdef _WIN32
#define USEWIN32IOAPI
#include "iowin32.h"
#endif

#include <windows.h>
#include <string>
#include <iostream>
#include <list>
#include <fstream>
#include <sstream>
#include <set>
using namespace std;

#define WRITEBUFFERSIZE (16384)
#define MAXFILENAME (256)

/*
 * filetime - look up the last-modification time of a file for use in a zip
 * entry header.
 *
 *   f      name of the file to query (wide string on Windows, narrow elsewhere)
 *   tmzip  broken-down time, filled only by the unix/macOS variant
 *   dt     packed DOS date/time, filled only by the Windows variant
 *
 * Returns 1 when the time could be read and 0 otherwise.
 */
#ifdef _WIN32
uLong filetime(
    wchar_t *f,                /* name of file to get info on */
    tm_zip *tmzip,             /* unused on Windows; the DOS time in dt is used instead */
    uLong *dt)                 /* dostime */
{
    uLong ret = 0;
    _WIN32_FIND_DATAW ff32;
    HANDLE hFind = FindFirstFileW(f,&ff32);

    (void)tmzip;

    if (hFind != INVALID_HANDLE_VALUE)
    {
        FILETIME ftLocal;

        FileTimeToLocalFileTime(&(ff32.ftLastWriteTime),&ftLocal);
        /* dt is treated as two 16-bit words: the date goes to the high word
           and the time to the low word. */
        FileTimeToDosDateTime(&ftLocal,((LPWORD)dt)+1,((LPWORD)dt)+0);
        FindClose(hFind);
        ret = 1;
    }
    return ret;
}
#else
/* "#ifdef a || b" only tests the first identifier, so both conditions are
   spelled out with defined(). */
#if defined(unix) || defined(__unix__) || defined(__APPLE__)
uLong filetime(
    char *f,               /* name of file to get info on */
    tm_zip *tmzip,         /* return value: access, modific. and creation times */
    uLong *dt)             /* unused in this variant */
{
    uLong ret = 0;
    struct stat s;        /* results of stat() */
    struct tm* filedate;
    time_t tm_t = 0;

    (void)dt;

    if (strcmp(f,"-") != 0)
    {
        char name[MAXFILENAME+1];
        size_t len = strlen(f);
        if (len > MAXFILENAME)
            len = MAXFILENAME;

        /* Copy exactly len bytes and terminate right behind them, so no
           byte of the buffer that is read later stays uninitialised. */
        memcpy(name, f, len);
        name[len] = '\0';

        /* not all systems allow stat'ing a file with / appended */
        if (len > 0 && name[len - 1] == '/')
            name[len - 1] = '\0';

        if (stat(name,&s) == 0)
        {
            tm_t = s.st_mtime;
            ret = 1;
        }
    }

    /* When the file could not be stat'ed, tm_t is still 0 and the epoch is reported. */
    filedate = localtime(&tm_t);
    if (filedate == NULL)
        return 0;

    tmzip->tm_sec  = filedate->tm_sec;
    tmzip->tm_min  = filedate->tm_min;
    tmzip->tm_hour = filedate->tm_hour;
    tmzip->tm_mday = filedate->tm_mday;
    tmzip->tm_mon  = filedate->tm_mon ;
    tmzip->tm_year = filedate->tm_year;

    return ret;
}
#else
/* Fallback for platforms without a known way to read file times. */
uLong filetime(
    char *f,                /* name of file to get info on */
    tm_zip *tmzip,          /* left untouched */
    uLong *dt)              /* left untouched */
{
    (void)f;
    (void)tmzip;
    (void)dt;
    return 0;
}
#endif
#endif

/*
 * addFileToList - append directory + fileName to fileList unless the file's
 * extension is listed in excludeFilterSet.
 *
 *   fileList          list that receives the full path
 *   directory         directory prefix, expected to end with a path separator
 *   excludeFilterSet  extensions (without the dot) that must be skipped
 *   fileName          bare file name inside directory
 *
 * The extension is the text after the last '.'. A name without any dot has
 * no extension and is therefore never filtered; previously the whole name
 * was treated as the extension in that case.
 */
void addFileToList(std::list<std::wstring>& fileList, const std::wstring& directory, const std::set<std::wstring>& excludeFilterSet, const std::wstring& fileName )
{
    const std::wstring::size_type dotPos = fileName.find_last_of(L'.');
    if (dotPos != std::wstring::npos)
    {
        const std::wstring fileExtension = fileName.substr(dotPos + 1);
        if (!fileExtension.empty() &&
            excludeFilterSet.find(fileExtension) != excludeFilterSet.end())
        {
            return;
        }
    }

    fileList.push_back(directory + fileName);
}

void GetFileListing(list<wstring>& fileList, wstring directory,const set<wstring>& excludeFilterSet,bool recursively=true)
{
    directory = directory + L"\\";
    wstring filter = directory + L"*";

    _WIN32_FIND_DATAW FindFileData;
    HANDLE hFind = FindFirstFileW(filter.c_str(), &FindFileData);

    if (hFind == INVALID_HANDLE_VALUE)
    {
        DWORD dwError = GetLastError();
        if (dwError != ERROR_FILE_NOT_FOUND)
        {
            //cout << "Invalid file handle for filter " << filter << ". Error is " << GetLastError() << endl;
        }
        return;
    }

    do
    {
        if (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) 
        {
            if ((recursively) && (wcscmp(FindFileData.cFileName, L".") != 0) && (wcscmp(FindFileData.cFileName, L"..") != 0))
            {
                GetFileListing(fileList, directory + FindFileData.cFileName, excludeFilterSet);
            }
        } 
        else
        {
            addFileToList(fileList, directory, excludeFilterSet,FindFileData.cFileName);
        }
    } while (FindNextFileW(hFind, &FindFileData) != 0);

    DWORD dwError = GetLastError();
    FindClose(hFind);

    if (dwError != ERROR_NO_MORE_FILES)
    {
        //cout << "FindNextFile error. Error is "<< dwError << endl;
    }
}


/* Returns 1 when filename can be opened for binary reading, 0 otherwise. */
int check_exist_file(wchar_t* filename)
{
    FILE* probe = _wfopen(filename,L"rb");
    if (probe == NULL)
    {
        return 0;
    }

    fclose(probe);
    return 1;
}

/*
 * getFileCrc - calculate the CRC32 of a file. The CRC must be known before a
 * file can be stored encrypted.
 *
 *   filenameinzip  path of the file to read (wide string)
 *   buf            scratch buffer used for reading
 *   size_buf       size of buf in bytes
 *   result_crc     receives the CRC; it is written even when an error occurs
 *
 * Returns ZIP_OK on success and ZIP_ERRNO when the file cannot be opened or
 * a read error occurs. Progress and errors are printed to stdout.
 */
int getFileCrc(const wchar_t * filenameinzip,void*buf,unsigned long size_buf,unsigned long* result_crc)
{
    unsigned long calculate_crc=0;
    int err=ZIP_OK;
    FILE * fin = _wfopen(filenameinzip,L"rb");

    if (fin==NULL)
    {
        err = ZIP_ERRNO;
    }
    else
    {
        unsigned long size_read = 0;
        do
        {
            size_read = (unsigned long)fread(buf,1,size_buf,fin);
            /* A short read that is not caused by end-of-file is a read error. */
            if ((size_read < size_buf) && (feof(fin)==0))
            {
                /* The name is a wide string, so it needs %ls rather than %s. */
                printf("error in reading %ls\n",filenameinzip);
                err = ZIP_ERRNO;
            }

            if (size_read>0)
                calculate_crc = crc32(calculate_crc,(const Bytef *)buf,size_read);

        } while ((err == ZIP_OK) && (size_read>0));

        fclose(fin);
    }

    *result_crc=calculate_crc;
    printf("file %ls crc %lx\n", filenameinzip, calculate_crc);
    return err;
}

int isLargeFile(const wchar_t * filename)
{
    int largeFile = 0;
    ZPOS64_T pos = 0;
    //FILE* pFile = FOPEN_FUNC(filename, "rb");
    FILE* pFile = _wfopen(filename, L"rb");

    if(pFile != NULL)
    {
        int n = FSEEKO_FUNC(pFile, 0, SEEK_END);
        pos = FTELLO_FUNC(pFile);

        printf("File : %s is %lld bytes\n", filename, pos);

        if(pos >= 0xffffffff)
            largeFile = 1;

        fclose(pFile);
    }

    return largeFile;
}

/*
 * split - cut text at every delimiter and insert the pieces into result.
 *
 * The first two characters of every piece are dropped before it is stored
 * (pieces shorter than that become empty).
 * NOTE(review): presumably this strips a leading "*." from patterns such as
 * "*.txt" - confirm against the callers.
 */
void split( const std::wstring& text, wchar_t delimiter,std::set<std::wstring>& result )
{
    std::wstring::size_type begin = 0;

    for (;;)
    {
        const std::wstring::size_type cut = text.find( delimiter, begin );
        const bool isLastPiece = ( cut == std::wstring::npos );

        std::wstring piece = isLastPiece ? text.substr( begin )
                                         : text.substr( begin, cut - begin );
        piece.erase( 0, 2 );
        result.insert( piece );

        if ( isLastPiece )
        {
            break;
        }
        begin = cut + 1;
    }
}

/*
 * Reads one Unicode code point from a wide string and advances cursor past it.
 *
 * A high surrogate followed by a low surrogate is combined into a single
 * supplementary code point (needed where wchar_t holds UTF-16 units). An
 * unpaired surrogate or a value above U+10FFFF is reported as U+FFFD so that
 * the callers always produce well-formed UTF-8.
 * The caller must ensure *cursor is not the terminator.
 */
static unsigned long utf8NextCodePoint(const wchar_t *&cursor)
{
    const unsigned long replacement = 0xFFFDul;
    const unsigned long unit = static_cast<unsigned long>(*cursor++);

    if (unit >= 0xD800ul && unit <= 0xDBFFul)
    {
        /* The terminator is never in the low-surrogate range, so this peek is safe. */
        const unsigned long low = static_cast<unsigned long>(*cursor);
        if (low >= 0xDC00ul && low <= 0xDFFFul)
        {
            ++cursor;
            return 0x10000ul + ((unit - 0xD800ul) << 10) + (low - 0xDC00ul);
        }
        return replacement;
    }
    if ((unit >= 0xDC00ul && unit <= 0xDFFFul) || unit > 0x10FFFFul)
        return replacement;

    return unit;
}

/*
 * getUTF8size - number of bytes needed to store string as UTF-8, not counting
 * the terminating zero. Returns 0 for a null pointer.
 *
 * The original note here read "Do not call me" - presumably this is meant
 * only as a helper for WChar_to_UTF8.
 */
long getUTF8size(const wchar_t *string){
    if (!string)
        return 0;
    long res=0;
    while (*string){
        const unsigned long cp = utf8NextCodePoint(string);
        if (cp<0x80)
            res+=1;
        else if (cp<0x800)
            res+=2;
        else if (cp<0x10000)
            res+=3;
        else
            res+=4;
    }
    return res;
}

/*
 * WChar_to_UTF8 - convert a zero-terminated wide string to UTF-8.
 *
 * Returns a zero-terminated buffer allocated with new[]; the caller must
 * release it with delete[]. A null pointer yields an empty string.
 * Supplementary characters are written as 4-byte sequences (previously each
 * surrogate was encoded on its own, which is not valid UTF-8).
 */
char *WChar_to_UTF8(const wchar_t *string){
    const long fSize=getUTF8size(string);
    char *res=new char[fSize+1];
    long b=0;
    if (string){
        while (*string){
            const unsigned long cp = utf8NextCodePoint(string);
            if (cp<0x80){
                res[b++]=static_cast<char>(cp);
            }else if (cp<0x800){
                res[b++]=static_cast<char>((cp>>6)|0xC0);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }else if (cp<0x10000){
                res[b++]=static_cast<char>((cp>>12)|0xE0);
                res[b++]=static_cast<char>(((cp>>6)&0x3F)|0x80);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }else{
                res[b++]=static_cast<char>((cp>>18)|0xF0);
                res[b++]=static_cast<char>(((cp>>12)&0x3F)|0x80);
                res[b++]=static_cast<char>(((cp>>6)&0x3F)|0x80);
                res[b++]=static_cast<char>((cp&0x3F)|0x80);
            }
        }
    }
    /* Terminate at the number of bytes actually written. */
    res[b]=0;
    return res;
}


/*
 * utf8_encode - convert a wide string to UTF-8 using WideCharToMultiByte.
 *
 * Returns the UTF-8 bytes, or an empty string when wstr is empty or the
 * conversion fails.
 */
std::string utf8_encode(const std::wstring &wstr)
{
    /* WideCharToMultiByte rejects a zero-length input, so handle it here. */
    if (wstr.empty())
        return std::string();

    const int wide_len = (int)wstr.size();

    /* First call: ask only for the number of bytes required. */
    const int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wide_len, NULL, 0, NULL, NULL);
    if (size_needed <= 0)
        return std::string();

    std::string strTo( static_cast<std::string::size_type>(size_needed), '\0' );

    /* Second call: perform the conversion into the sized buffer. */
    const int written = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wide_len, &strTo[0], size_needed, NULL, NULL);
    if (written <= 0)
        return std::string();

    strTo.resize( static_cast<std::string::size_type>(written) );
    return strTo;
}


wstring zipper( const wstring& directoryPath, const wstring& strExcludeFilter, wstring & zipFileName )
{
    int opt_overwrite=0,opt_compress_level=Z_BEST_COMPRESSION,opt_exclude_path=0,err=0,size_buf=0;
    void* buf=NULL;
    const char* password=NULL;
    list<wstring> fileList;
    DWORD dwRet;

    wchar_t cCurrentPath[MAX_PATH];
    dwRet = GetCurrentDirectoryW(MAX_PATH, cCurrentPath);
    if( dwRet == 0 )
    {
        return wstring();
    }

    // Change the directory to the current folder
    _wchdir(directoryPath.c_str());
    set<wstring> excludeFilterSet;
    split(strExcludeFilter,'|',excludeFilterSet);

    GetFileListing(fileList, directoryPath,excludeFilterSet);
    opt_overwrite = 1;

    size_buf = WRITEBUFFERSIZE;
    buf = (void*)malloc(size_buf);
    if (buf==NULL) return wstring();

    wchar_t tempDirPath[MAX_PATH];
    dwRet = GetTempPathW (MAX_PATH, tempDirPath);
    if( dwRet == 0 ) return wstring();

    wstring directoryName,zipFilePath;
    _WIN32_FIND_DATAW FindFileData;
    HANDLE hFind = FindFirstFileW(directoryPath.c_str(), &FindFileData);
    if (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) 
    {
        directoryName = FindFileData.cFileName;
    }

    zipFilePath = wstring(tempDirPath)+L"\\"+directoryName+L".zip";
    zipFile zf;
    int errclose;

    #ifdef USEWIN32IOAPI
        zlib_filefunc64_def ffunc;
        fill_win32_filefunc64W (&ffunc);
        zf = zipOpen2_64(zipFilePath.c_str(),(opt_overwrite==2) ? 2 : 0,NULL,&ffunc);
    #   else
        zf = zipOpen64(zipFilePath.c_str(),(opt_overwrite==2) ? 2 : 0);
    #   endif

    if (zf == NULL)
    {
        //printf("error opening %s\n",filename_try);
        err= ZIP_ERRNO;
    }
    else
    {
        //printf("creating %s\n",filename_try);
    }

    for(list<wstring>::iterator it = fileList.begin() ; it!=fileList.end();++it) 
    {
        FILE * fin;
        int size_read;
        //const char* filenameinzip = (*it).c_str();
        wstring filenameinzip = (*it).c_str();
        wchar_t szOut[MAX_PATH];

        PathRelativePathToW(szOut,
                            directoryPath.c_str(),
                            FILE_ATTRIBUTE_DIRECTORY,
                            filenameinzip.c_str(),
                            FILE_ATTRIBUTE_NORMAL);

        wchar_t *savefilenameinzip;
        zip_fileinfo zi;
        unsigned long crcFile=0;
        int zip64 = 0;

        zi.tmz_date.tm_sec = zi.tmz_date.tm_min = zi.tmz_date.tm_hour =
            zi.tmz_date.tm_mday = zi.tmz_date.tm_mon = zi.tmz_date.tm_year = 0;
        zi.dosDate = 0;
        zi.internal_fa = 0;
    zi.external_fa = 0;
        filetime(szOut,&zi.tmz_date,&zi.dosDate);

        if ((password != NULL) && (err==ZIP_OK))
            err = getFileCrc(szOut,buf,size_buf,&crcFile);

        zip64 = isLargeFile(szOut);

        /* The path name saved, should not include a leading slash. */
        /*if it did, windows/xp and dynazip couldn't read the zip file. */
        savefilenameinzip = szOut;
        while( savefilenameinzip[0] == '\\' || savefilenameinzip[0] == '/' )
        {
            savefilenameinzip++;
        }

        string outstr = utf8_encode(savefilenameinzip);
        //char * op = (char*)outstr.c_str();


        err = zipOpenNewFileInZip3_64(  zf,outstr.c_str(),&zi,
                                        NULL,0,NULL,0,NULL /* comment*/,
                                        (opt_compress_level != 0) ? Z_DEFLATED : 0,
                                        opt_compress_level,0,
                                        /* -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY, */
                                        -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY,
                                        password,crcFile, zip64);

        if (err != ZIP_OK)
        {
            //printf("error in opening %s in zipfile\n",szOut);
        }
        else
        {
            //fin = FOPEN_FUNC(szOut,"rb");
            fin = _wfopen(szOut,L"rb");

            if (fin==NULL)
            {
                err=ZIP_ERRNO;
                //printf("error in opening %s for reading\n",szOut);
            }
        }

        if (err == ZIP_OK)
            do
            {
                err = ZIP_OK;
                size_read = (int)fread(buf,1,size_buf,fin);
                if (size_read < size_buf)
                    if (feof(fin)==0)
                    {
                        //printf("error in reading %s\n",szOut);
                        err = ZIP_ERRNO;
                    }

                    if (size_read>0)
                    {
                        err = zipWriteInFileInZip (zf,buf,size_read);
                        if (err<0)
                        {
                            //printf("error in writing %s in the zipfile\n",szOut);
                        }

                    }
            } while ((err == ZIP_OK) && (size_read>0));

            if (fin)
                fclose(fin);

            if (err<0)
                err=ZIP_ERRNO;
            else
            {
                err = zipCloseFileInZip(zf);
                if (err!=ZIP_OK)
                {
                    //printf("error in closing %s in the zipfile\n",szOut);
                }
            }
    }

    errclose = zipClose(zf,NULL);
    if (errclose != ZIP_OK)
    {
        //printf("error in closing %s\n",filename_try);
    }

    free(buf);
    // Change back the executabe context
    _wchdir(cCurrentPath);
    return zipFilePath;
}
like image 428
RP. Avatar asked Jan 31 '13 12:01

RP.


1 Answers

The official way to store UTF-8 file names in a ZIP file, according to the standard, is setting “general purpose bit 11”. Looking at the minizip sources, it seems to me that minizip will not set this bit for you at any time, and that zipOpenNewFileInZip3_64 provides no way to pass this bit. There is however a zipOpenNewFileInZip4_64 which accepts two more arguments, versionMadeBy and flagBase. So you might be able to store UTF-8 file names in accordance with the standard by changing your call to

err = zipOpenNewFileInZip4_64(zf, outstr.c_str(), […], crcFile, 36, 1<<11, zip64);

This is assuming that outstr actually does contain a valid UTF-8 encoding of your file name, which the source code suggests it should, but which I haven't verified. I suggest you print the hexadecimal values of the bytes of outstr to verify this. Unless I garbled the string in the process, your “統一碼.txt” should become e7 b5 b1 e4 b8 80 e7 a2 bc 2e 74 78 74 in hexadecimal UTF-8.

For details on this versionMadeBy field (which I set to 36 in my call), look at section 4.4.2 of the standard. It depends on what platform you are using, what format the file attributes from the zipfi argument (&zi in your case) have, and what version of the standard everything adheres to. As you are using unicode file names, I'd say you're using version 6.3.* of the standard, so the lower byte should be 36. And as the minizip.c wrapper does not store any file attributes at all, you don't have to specify a platform there. The lack of attributes can be seen from these lines:

    zi.internal_fa = 0;
    zi.external_fa = 0;

Note that even though the standard does provide a way to denote unicode file names, that part was only added in 2006, and there might still be many ZIP applications out there which do not support it. So even if your archive is correct, your unzip utility might still unzip this file incorrectly, interpreting the UTF-8 bytes as codepage 437 or latin 1 or similar.

like image 149
MvG Avatar answered Sep 22 '22 02:09

MvG