123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606 |
- #include "rar.hpp"
- #define MBFUNCTIONS
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Helpers for Unix names holding bytes which have no Unicode equivalent.
// Both are defined further down in this file.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

// Start of the 0xE000 - 0xE0FF private use Unicode area. In Unix it receives
// high ASCII characters which cannot be converted to Unicode.
static const uint MapAreaStart=0xE000;

// Marker of a string containing mapped characters. 0xFFFF was used at first,
// but MSVC2008 swprintf treats 0xFFFF as an error marker and fails.
// A workaround is possible, but picking another character is safer.
static const uint MappedStringMark=0xFFFE;
#endif
- bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
- {
- bool RetCode=true;
- *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
- #ifdef _WIN_ALL
- if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
- RetCode=false;
- // wcstombs is broken in Android NDK r9.
- #elif defined(_APPLE)
- WideToUtf(Src,Dest,DestSize);
- #elif defined(MBFUNCTIONS)
- if (!WideToCharMap(Src,Dest,DestSize,RetCode))
- {
- mbstate_t ps; // Use thread safe external state based functions.
- memset (&ps, 0, sizeof(ps));
- const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
- // Some implementations of wcsrtombs can cause memory analyzing tools
- // like valgrind to report uninitialized data access. It happens because
- // internally these implementations call SSE4 based wcslen function,
- // which reads 16 bytes at once including those beyond of trailing 0.
- size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
- if (ResultingSize==(size_t)-1 && errno==EILSEQ)
- {
- // Aborted on inconvertible character not zero terminating the result.
- // EILSEQ helps to distinguish it from small output buffer abort.
- // We want to convert as much as we can, so we clean the output buffer
- // and repeat conversion.
- memset (&ps, 0, sizeof(ps));
- SrcParam=Src; // wcsrtombs can change the pointer.
- memset(Dest,0,DestSize);
- ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
- }
- if (ResultingSize==(size_t)-1)
- RetCode=false;
- if (ResultingSize==0 && *Src!=0)
- RetCode=false;
- }
- #else
- for (int I=0;I<DestSize;I++)
- {
- Dest[I]=(char)Src[I];
- if (Src[I]==0)
- break;
- }
- #endif
- if (DestSize>0)
- Dest[DestSize-1]=0;
- // We tried to return the empty string if conversion is failed,
- // but it does not work well. WideCharToMultiByte returns 'failed' code
- // and partially converted string even if we wanted to convert only a part
- // of string and passed DestSize smaller than required for fully converted
- // string. Such call is the valid behavior in RAR code and we do not expect
- // the empty string in this case.
- return RetCode;
- }
- bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
- {
- bool RetCode=true;
- *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
- #ifdef _WIN_ALL
- if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
- RetCode=false;
- // mbstowcs is broken in Android NDK r9.
- #elif defined(_APPLE)
- UtfToWide(Src,Dest,DestSize);
- #elif defined(MBFUNCTIONS)
- mbstate_t ps;
- memset (&ps, 0, sizeof(ps));
- const char *SrcParam=Src; // mbsrtowcs can change the pointer.
- size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
- if (ResultingSize==(size_t)-1)
- RetCode=false;
- if (ResultingSize==0 && *Src!=0)
- RetCode=false;
- if (RetCode==false && DestSize>1)
- CharToWideMap(Src,Dest,DestSize,RetCode);
- #else
- for (int I=0;I<DestSize;I++)
- {
- Dest[I]=(wchar_t)Src[I];
- if (Src[I]==0)
- break;
- }
- #endif
- if (DestSize>0)
- Dest[DestSize-1]=0;
- // We tried to return the empty string if conversion is failed,
- // but it does not work well. MultiByteToWideChar returns 'failed' code
- // even if we wanted to convert only a part of string and passed DestSize
- // smaller than required for fully converted string. Such call is the valid
- // behavior in RAR code and we do not expect the empty string in this case.
- return RetCode;
- }
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and restore mapped inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
// Returns false without touching 'Dest' if 'Src' does not contain the mapped
// string mark, so the caller has to use the usual conversion. Otherwise
// returns true and stores the conversion result to 'Success'.
// 'DestSize' is the size of 'Dest' in bytes, trailing zero included.
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
{
  // String with inconvertible characters mapped to private use Unicode area
  // must have the mark code somewhere.
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
    return false;

  // Nothing can be stored to an empty buffer, not even the trailing zero.
  if (DestSize==0)
  {
    Success=false;
    return true;
  }

  // Seems to be that wcrtomb in some memory analyzing libraries
  // can produce uninitialized output while reporting success on garbage input.
  // So we clean the destination to calm analyzers.
  memset(Dest,0,DestSize);

  Success=true;
  size_t SrcPos=0,DestPos=0;

  // wcrtomb can write up to MB_CUR_MAX bytes, so we continue only while
  // such number of bytes and the trailing zero fit into the buffer.
  // The sum is used instead of 'DestSize-MB_CUR_MAX', because the unsigned
  // difference wraps around to a huge value if DestSize<MB_CUR_MAX
  // and then the loop writes beyond the end of 'Dest'.
  while (Src[SrcPos]!=0 && DestPos+MB_CUR_MAX<DestSize)
  {
    if (uint(Src[SrcPos])==MappedStringMark)
    {
      SrcPos++;
      continue;
    }
    // For security reasons do not restore low ASCII codes, so mapping cannot
    // be used to hide control codes like path separators.
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
    else
    {
      mbstate_t ps;
      memset(&ps,0,sizeof(ps));
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
      {
        // Replace the inconvertible character and report the failure.
        Dest[DestPos]='_';
        Success=false;
      }
      SrcPos++;

      // Measure the sequence which has just been stored to advance 'DestPos'.
      // Error codes become negative here, so we advance by one byte then.
      memset(&ps,0,sizeof(ps));
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
      DestPos+=Max(Length,1);
    }
  }
  Dest[Min(DestPos,DestSize-1)]=0;
  return true;
}
#endif
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert a multibyte string to wide, mapping inconvertible bytes.
// We use it for extended ASCII names in Unix.
// 'Success' is set to true only if the end of 'Src' has been reached.
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
{
  // Inconvertible characters go to the private use Unicode area 0xE000.
  // Such string is marked by the special non-character code placed
  // before the first inconvertible character.
  Success=false;
  bool Marked=false;
  uint In=0,Out=0;
  while (Out<DestSize)
  {
    if (Src[In]==0)
    {
      Success=true;
      break;
    }

    mbstate_t State;
    memset(&State,0,sizeof(State));
    size_t Converted=mbrtowc(Dest+Out,Src+In,MB_CUR_MAX,&State);
    bool Failed=Converted==(size_t)-1 || Converted==(size_t)-2;

    if (!Failed)
    {
      // Converted normally. Measure the source sequence to skip it.
      memset(&State,0,sizeof(State));
      int SeqLength=mbrlen(Src+In,MB_CUR_MAX,&State);
      In+=Max(SeqLength,1);
      Out++;
      continue;
    }

    // For security reasons we do not want to map low ASCII characters,
    // so we do not have additional .. and path separator codes.
    if (byte(Src[In])<0x80)
      break;

    if (!Marked)
    {
      Dest[Out++]=MappedStringMark;
      Marked=true;
      if (Out>=DestSize)
        break;
    }
    Dest[Out++]=byte(Src[In++])+MapAreaStart;
  }
  Dest[Min(Out,DestSize-1)]=0;
}
#endif
- // SrcSize is in wide characters, not in bytes.
- byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
- {
- for (size_t I=0;I<SrcSize;I++,Src++)
- {
- Dest[I*2]=(byte)*Src;
- Dest[I*2+1]=(byte)(*Src>>8);
- if (*Src==0)
- break;
- }
- return Dest;
- }
- wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
- {
- for (size_t I=0;I<DestSize;I++)
- if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
- break;
- return Dest;
- }
- void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
- {
- long dsize=(long)DestSize;
- dsize--;
- while (*Src!=0 && --dsize>=0)
- {
- uint c=*(Src++);
- if (c<0x80)
- *(Dest++)=c;
- else
- if (c<0x800 && --dsize>=0)
- {
- *(Dest++)=(0xc0|(c>>6));
- *(Dest++)=(0x80|(c&0x3f));
- }
- else
- {
- if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
- {
- c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
- Src++;
- }
- if (c<0x10000 && (dsize-=2)>=0)
- {
- *(Dest++)=(0xe0|(c>>12));
- *(Dest++)=(0x80|((c>>6)&0x3f));
- *(Dest++)=(0x80|(c&0x3f));
- }
- else
- if (c < 0x200000 && (dsize-=3)>=0)
- {
- *(Dest++)=(0xf0|(c>>18));
- *(Dest++)=(0x80|((c>>12)&0x3f));
- *(Dest++)=(0x80|((c>>6)&0x3f));
- *(Dest++)=(0x80|(c&0x3f));
- }
- }
- }
- *Dest=0;
- }
- size_t WideToUtfSize(const wchar *Src)
- {
- size_t Size=0;
- for (;*Src!=0;Src++)
- if (*Src<0x80)
- Size++;
- else
- if (*Src<0x800)
- Size+=2;
- else
- if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
- {
- if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
- {
- Size+=4; // 4 output bytes for Unicode surrogate pair.
- Src++;
- }
- else
- Size+=3;
- }
- else
- if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
- Size+=4;
- return Size+1; // Include terminating zero.
- }
- bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
- {
- bool Success=true;
- long dsize=(long)DestSize;
- dsize--;
- while (*Src!=0)
- {
- uint c=byte(*(Src++)),d;
- if (c<0x80)
- d=c;
- else
- if ((c>>5)==6)
- {
- if ((*Src&0xc0)!=0x80)
- {
- Success=false;
- break;
- }
- d=((c&0x1f)<<6)|(*Src&0x3f);
- Src++;
- }
- else
- if ((c>>4)==14)
- {
- if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
- {
- Success=false;
- break;
- }
- d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
- Src+=2;
- }
- else
- if ((c>>3)==30)
- {
- if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
- {
- Success=false;
- break;
- }
- d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
- Src+=3;
- }
- else
- {
- Success=false;
- break;
- }
- if (--dsize<0)
- break;
- if (d>0xffff)
- {
- if (--dsize<0)
- break;
- if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
- {
- Success=false;
- continue;
- }
- if (sizeof(*Dest)==2) // Use the surrogate pair.
- {
- *(Dest++)=((d-0x10000)>>10)+0xd800;
- *(Dest++)=(d&0x3ff)+0xdc00;
- }
- else
- *(Dest++)=d;
- }
- else
- *(Dest++)=d;
- }
- *Dest=0;
- return Success;
- }
- // For zero terminated strings.
- bool IsTextUtf8(const byte *Src)
- {
- return IsTextUtf8(Src,strlen((const char *)Src));
- }
- // Source data can be both with and without UTF-8 BOM.
- bool IsTextUtf8(const byte *Src,size_t SrcSize)
- {
- while (SrcSize-- > 0)
- {
- byte C=*(Src++);
- int HighOne=0; // Number of leftmost '1' bits.
- for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
- HighOne++;
- if (HighOne==1 || HighOne>6)
- return false;
- while (--HighOne > 0)
- if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
- return false;
- }
- return true;
- }
- int wcsicomp(const wchar *s1,const wchar *s2)
- {
- #ifdef _WIN_ALL
- return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
- #else
- while (true)
- {
- wchar u1 = towupper(*s1);
- wchar u2 = towupper(*s2);
- if (u1 != u2)
- return u1 < u2 ? -1 : 1;
- if (*s1==0)
- break;
- s1++;
- s2++;
- }
- return 0;
- #endif
- }
- int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
- {
- #ifdef _WIN_ALL
- // If we specify 'n' exceeding the actual string length, CompareString goes
- // beyond the trailing zero and compares garbage. So we need to limit 'n'
- // to real string length.
- size_t l1=Min(wcslen(s1)+1,n);
- size_t l2=Min(wcslen(s2)+1,n);
- return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
- #else
- if (n==0)
- return 0;
- while (true)
- {
- wchar u1 = towupper(*s1);
- wchar u2 = towupper(*s2);
- if (u1 != u2)
- return u1 < u2 ? -1 : 1;
- if (*s1==0 || --n==0)
- break;
- s1++;
- s2++;
- }
- return 0;
- #endif
- }
- // Case insensitive wcsstr().
- const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
- {
- for (size_t i=0;str[i]!=0;i++)
- for (size_t j=0;;j++)
- {
- if (search[j]==0)
- return str+i;
- if (tolowerw(str[i+j])!=tolowerw(search[j]))
- break;
- }
- return NULL;
- }
- #ifndef SFX_MODULE
- wchar* wcslower(wchar *s)
- {
- #ifdef _WIN_ALL
- // _wcslwr requires setlocale and we do not want to depend on setlocale
- // in Windows. Also CharLower involves less overhead.
- CharLower(s);
- #else
- for (wchar *c=s;*c!=0;c++)
- *c=towlower(*c);
- #endif
- return s;
- }
- #endif
- #ifndef SFX_MODULE
- wchar* wcsupper(wchar *s)
- {
- #ifdef _WIN_ALL
- // _wcsupr requires setlocale and we do not want to depend on setlocale
- // in Windows. Also CharUpper involves less overhead.
- CharUpper(s);
- #else
- for (wchar *c=s;*c!=0;c++)
- *c=towupper(*c);
- #endif
- return s;
- }
- #endif
// Convert a single wide character code to upper case.
int toupperw(int ch)
{
#if defined(_WIN_ALL)
  // towupper in Windows seems to be C locale dependent even in Unicode
  // version. For example, it failed to convert lowercase Russian characters.
  // CharUpper is more reliable. The 0xffff mask prevents crash if value
  // larger than 0xffff is passed to this function.
  const INT_PTR Masked=ch&0xffff;
  return (int)(INT_PTR)CharUpper((wchar *)Masked);
#else
  const int Upper=towupper(ch);
  return Upper;
#endif
}
// Convert a single wide character code to lower case.
int tolowerw(int ch)
{
#if defined(_WIN_ALL)
  // CharLower is more reliable than towlower in Windows, which seems to be
  // C locale dependent even in Unicode version. The 0xffff mask prevents
  // crash if value larger than 0xffff is passed to this function.
  const INT_PTR Masked=ch&0xffff;
  return (int)(INT_PTR)CharLower((wchar *)Masked);
#else
  const int Lower=towlower(ch);
  return Lower;
#endif
}
- int atoiw(const wchar *s)
- {
- return (int)atoilw(s);
- }
- int64 atoilw(const wchar *s)
- {
- bool sign=false;
- if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
- {
- s++;
- sign=true;
- }
- // Use unsigned type here, since long string can overflow the variable
- // and signed integer overflow is undefined behavior in C++.
- uint64 n=0;
- while (*s>='0' && *s<='9')
- {
- n=n*10+(*s-'0');
- s++;
- }
- // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
- // when negating 0x8000000000000000.
- return sign && int64(n)>=0 ? -int64(n) : int64(n);
- }
#ifdef DBCS_SUPPORTED
// Global object, its tables are filled by constructor.
SupportDBCS gdbcs;

SupportDBCS::SupportDBCS()
{
  Init();
}

// Detect if the ANSI code page is multibyte and fill the lead byte table.
void SupportDBCS::Init()
{
  CPINFO CPInfo;
  // Do not read CPInfo if GetCPInfo failed, because its contents
  // is not guaranteed to be set then. Assume the single byte code page
  // in such case.
  DBCSMode=GetCPInfo(CP_ACP,&CPInfo)!=0 && CPInfo.MaxCharSize > 1;
  for (uint I=0;I<ASIZE(IsLeadByte);I++)
    IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
}

// Return the pointer to the character following the one pointed by 's'.
char* SupportDBCS::charnext(const char *s)
{
  // Zero cannot be the trail byte. So if next byte after the lead byte
  // is 0, the string is corrupt and we'll better return the pointer to 0,
  // to break string processing loops.
  const bool DoubleByte=IsLeadByte[(byte)*s] && s[1]!=0;
  return (char *)(DoubleByte ? s+2:s+1);
}
#endif
|