1
0

unicode.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. #include "rar.hpp"
  2. #define MBFUNCTIONS
  3. #if defined(_UNIX) && defined(MBFUNCTIONS)
  4. static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
  5. static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);
  6. // In Unix we map high ASCII characters which cannot be converted to Unicode
  7. // to 0xE000 - 0xE0FF private use Unicode area.
  8. static const uint MapAreaStart=0xE000;
  9. // Mapped string marker. Initially we used 0xFFFF for this purpose,
  10. // but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
  11. // While we could workaround it, it is safer to use another character.
  12. static const uint MappedStringMark=0xFFFE;
  13. #endif
  14. bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
  15. {
  16. bool RetCode=true;
  17. *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
  18. #ifdef _WIN_ALL
  19. if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
  20. RetCode=false;
  21. // wcstombs is broken in Android NDK r9.
  22. #elif defined(_APPLE)
  23. WideToUtf(Src,Dest,DestSize);
  24. #elif defined(MBFUNCTIONS)
  25. if (!WideToCharMap(Src,Dest,DestSize,RetCode))
  26. {
  27. mbstate_t ps; // Use thread safe external state based functions.
  28. memset (&ps, 0, sizeof(ps));
  29. const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
  30. // Some implementations of wcsrtombs can cause memory analyzing tools
  31. // like valgrind to report uninitialized data access. It happens because
  32. // internally these implementations call SSE4 based wcslen function,
  33. // which reads 16 bytes at once including those beyond of trailing 0.
  34. size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
  35. if (ResultingSize==(size_t)-1 && errno==EILSEQ)
  36. {
  37. // Aborted on inconvertible character not zero terminating the result.
  38. // EILSEQ helps to distinguish it from small output buffer abort.
  39. // We want to convert as much as we can, so we clean the output buffer
  40. // and repeat conversion.
  41. memset (&ps, 0, sizeof(ps));
  42. SrcParam=Src; // wcsrtombs can change the pointer.
  43. memset(Dest,0,DestSize);
  44. ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
  45. }
  46. if (ResultingSize==(size_t)-1)
  47. RetCode=false;
  48. if (ResultingSize==0 && *Src!=0)
  49. RetCode=false;
  50. }
  51. #else
  52. for (int I=0;I<DestSize;I++)
  53. {
  54. Dest[I]=(char)Src[I];
  55. if (Src[I]==0)
  56. break;
  57. }
  58. #endif
  59. if (DestSize>0)
  60. Dest[DestSize-1]=0;
  61. // We tried to return the empty string if conversion is failed,
  62. // but it does not work well. WideCharToMultiByte returns 'failed' code
  63. // and partially converted string even if we wanted to convert only a part
  64. // of string and passed DestSize smaller than required for fully converted
  65. // string. Such call is the valid behavior in RAR code and we do not expect
  66. // the empty string in this case.
  67. return RetCode;
  68. }
  69. bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
  70. {
  71. bool RetCode=true;
  72. *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
  73. #ifdef _WIN_ALL
  74. if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
  75. RetCode=false;
  76. // mbstowcs is broken in Android NDK r9.
  77. #elif defined(_APPLE)
  78. UtfToWide(Src,Dest,DestSize);
  79. #elif defined(MBFUNCTIONS)
  80. mbstate_t ps;
  81. memset (&ps, 0, sizeof(ps));
  82. const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  83. size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  84. if (ResultingSize==(size_t)-1)
  85. RetCode=false;
  86. if (ResultingSize==0 && *Src!=0)
  87. RetCode=false;
  88. if (RetCode==false && DestSize>1)
  89. CharToWideMap(Src,Dest,DestSize,RetCode);
  90. #else
  91. for (int I=0;I<DestSize;I++)
  92. {
  93. Dest[I]=(wchar_t)Src[I];
  94. if (Src[I]==0)
  95. break;
  96. }
  97. #endif
  98. if (DestSize>0)
  99. Dest[DestSize-1]=0;
  100. // We tried to return the empty string if conversion is failed,
  101. // but it does not work well. MultiByteToWideChar returns 'failed' code
  102. // even if we wanted to convert only a part of string and passed DestSize
  103. // smaller than required for fully converted string. Such call is the valid
  104. // behavior in RAR code and we do not expect the empty string in this case.
  105. return RetCode;
  106. }
  107. #if defined(_UNIX) && defined(MBFUNCTIONS)
  108. // Convert and restore mapped inconvertible Unicode characters.
  109. // We use it for extended ASCII names in Unix.
  110. bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
  111. {
  112. // String with inconvertible characters mapped to private use Unicode area
  113. // must have the mark code somewhere.
  114. if (wcschr(Src,(wchar)MappedStringMark)==NULL)
  115. return false;
  116. // Seems to be that wcrtomb in some memory analyzing libraries
  117. // can produce uninitilized output while reporting success on garbage input.
  118. // So we clean the destination to calm analyzers.
  119. memset(Dest,0,DestSize);
  120. Success=true;
  121. uint SrcPos=0,DestPos=0;
  122. while (Src[SrcPos]!=0 && DestPos<DestSize-MB_CUR_MAX)
  123. {
  124. if (uint(Src[SrcPos])==MappedStringMark)
  125. {
  126. SrcPos++;
  127. continue;
  128. }
  129. // For security reasons do not restore low ASCII codes, so mapping cannot
  130. // be used to hide control codes like path separators.
  131. if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
  132. Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
  133. else
  134. {
  135. mbstate_t ps;
  136. memset(&ps,0,sizeof(ps));
  137. if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
  138. {
  139. Dest[DestPos]='_';
  140. Success=false;
  141. }
  142. SrcPos++;
  143. memset(&ps,0,sizeof(ps));
  144. int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
  145. DestPos+=Max(Length,1);
  146. }
  147. }
  148. Dest[Min(DestPos,DestSize-1)]=0;
  149. return true;
  150. }
  151. #endif
  152. #if defined(_UNIX) && defined(MBFUNCTIONS)
  153. // Convert and map inconvertible Unicode characters.
  154. // We use it for extended ASCII names in Unix.
  155. void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
  156. {
  157. // Map inconvertible characters to private use Unicode area 0xE000.
  158. // Mark such string by placing special non-character code before
  159. // first inconvertible character.
  160. Success=false;
  161. bool MarkAdded=false;
  162. uint SrcPos=0,DestPos=0;
  163. while (DestPos<DestSize)
  164. {
  165. if (Src[SrcPos]==0)
  166. {
  167. Success=true;
  168. break;
  169. }
  170. mbstate_t ps;
  171. memset(&ps,0,sizeof(ps));
  172. size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
  173. if (res==(size_t)-1 || res==(size_t)-2)
  174. {
  175. // For security reasons we do not want to map low ASCII characters,
  176. // so we do not have additional .. and path separator codes.
  177. if (byte(Src[SrcPos])>=0x80)
  178. {
  179. if (!MarkAdded)
  180. {
  181. Dest[DestPos++]=MappedStringMark;
  182. MarkAdded=true;
  183. if (DestPos>=DestSize)
  184. break;
  185. }
  186. Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
  187. }
  188. else
  189. break;
  190. }
  191. else
  192. {
  193. memset(&ps,0,sizeof(ps));
  194. int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
  195. SrcPos+=Max(Length,1);
  196. DestPos++;
  197. }
  198. }
  199. Dest[Min(DestPos,DestSize-1)]=0;
  200. }
  201. #endif
  202. // SrcSize is in wide characters, not in bytes.
  203. byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
  204. {
  205. for (size_t I=0;I<SrcSize;I++,Src++)
  206. {
  207. Dest[I*2]=(byte)*Src;
  208. Dest[I*2+1]=(byte)(*Src>>8);
  209. if (*Src==0)
  210. break;
  211. }
  212. return Dest;
  213. }
  214. wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
  215. {
  216. for (size_t I=0;I<DestSize;I++)
  217. if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
  218. break;
  219. return Dest;
  220. }
  221. void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
  222. {
  223. long dsize=(long)DestSize;
  224. dsize--;
  225. while (*Src!=0 && --dsize>=0)
  226. {
  227. uint c=*(Src++);
  228. if (c<0x80)
  229. *(Dest++)=c;
  230. else
  231. if (c<0x800 && --dsize>=0)
  232. {
  233. *(Dest++)=(0xc0|(c>>6));
  234. *(Dest++)=(0x80|(c&0x3f));
  235. }
  236. else
  237. {
  238. if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
  239. {
  240. c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
  241. Src++;
  242. }
  243. if (c<0x10000 && (dsize-=2)>=0)
  244. {
  245. *(Dest++)=(0xe0|(c>>12));
  246. *(Dest++)=(0x80|((c>>6)&0x3f));
  247. *(Dest++)=(0x80|(c&0x3f));
  248. }
  249. else
  250. if (c < 0x200000 && (dsize-=3)>=0)
  251. {
  252. *(Dest++)=(0xf0|(c>>18));
  253. *(Dest++)=(0x80|((c>>12)&0x3f));
  254. *(Dest++)=(0x80|((c>>6)&0x3f));
  255. *(Dest++)=(0x80|(c&0x3f));
  256. }
  257. }
  258. }
  259. *Dest=0;
  260. }
  261. size_t WideToUtfSize(const wchar *Src)
  262. {
  263. size_t Size=0;
  264. for (;*Src!=0;Src++)
  265. if (*Src<0x80)
  266. Size++;
  267. else
  268. if (*Src<0x800)
  269. Size+=2;
  270. else
  271. if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
  272. {
  273. if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
  274. {
  275. Size+=4; // 4 output bytes for Unicode surrogate pair.
  276. Src++;
  277. }
  278. else
  279. Size+=3;
  280. }
  281. else
  282. if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
  283. Size+=4;
  284. return Size+1; // Include terminating zero.
  285. }
  286. bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
  287. {
  288. bool Success=true;
  289. long dsize=(long)DestSize;
  290. dsize--;
  291. while (*Src!=0)
  292. {
  293. uint c=byte(*(Src++)),d;
  294. if (c<0x80)
  295. d=c;
  296. else
  297. if ((c>>5)==6)
  298. {
  299. if ((*Src&0xc0)!=0x80)
  300. {
  301. Success=false;
  302. break;
  303. }
  304. d=((c&0x1f)<<6)|(*Src&0x3f);
  305. Src++;
  306. }
  307. else
  308. if ((c>>4)==14)
  309. {
  310. if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
  311. {
  312. Success=false;
  313. break;
  314. }
  315. d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
  316. Src+=2;
  317. }
  318. else
  319. if ((c>>3)==30)
  320. {
  321. if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
  322. {
  323. Success=false;
  324. break;
  325. }
  326. d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
  327. Src+=3;
  328. }
  329. else
  330. {
  331. Success=false;
  332. break;
  333. }
  334. if (--dsize<0)
  335. break;
  336. if (d>0xffff)
  337. {
  338. if (--dsize<0)
  339. break;
  340. if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
  341. {
  342. Success=false;
  343. continue;
  344. }
  345. if (sizeof(*Dest)==2) // Use the surrogate pair.
  346. {
  347. *(Dest++)=((d-0x10000)>>10)+0xd800;
  348. *(Dest++)=(d&0x3ff)+0xdc00;
  349. }
  350. else
  351. *(Dest++)=d;
  352. }
  353. else
  354. *(Dest++)=d;
  355. }
  356. *Dest=0;
  357. return Success;
  358. }
  359. // For zero terminated strings.
  360. bool IsTextUtf8(const byte *Src)
  361. {
  362. return IsTextUtf8(Src,strlen((const char *)Src));
  363. }
  364. // Source data can be both with and without UTF-8 BOM.
  365. bool IsTextUtf8(const byte *Src,size_t SrcSize)
  366. {
  367. while (SrcSize-- > 0)
  368. {
  369. byte C=*(Src++);
  370. int HighOne=0; // Number of leftmost '1' bits.
  371. for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
  372. HighOne++;
  373. if (HighOne==1 || HighOne>6)
  374. return false;
  375. while (--HighOne > 0)
  376. if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
  377. return false;
  378. }
  379. return true;
  380. }
  381. int wcsicomp(const wchar *s1,const wchar *s2)
  382. {
  383. #ifdef _WIN_ALL
  384. return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
  385. #else
  386. while (true)
  387. {
  388. wchar u1 = towupper(*s1);
  389. wchar u2 = towupper(*s2);
  390. if (u1 != u2)
  391. return u1 < u2 ? -1 : 1;
  392. if (*s1==0)
  393. break;
  394. s1++;
  395. s2++;
  396. }
  397. return 0;
  398. #endif
  399. }
  400. int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
  401. {
  402. #ifdef _WIN_ALL
  403. // If we specify 'n' exceeding the actual string length, CompareString goes
  404. // beyond the trailing zero and compares garbage. So we need to limit 'n'
  405. // to real string length.
  406. size_t l1=Min(wcslen(s1)+1,n);
  407. size_t l2=Min(wcslen(s2)+1,n);
  408. return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
  409. #else
  410. if (n==0)
  411. return 0;
  412. while (true)
  413. {
  414. wchar u1 = towupper(*s1);
  415. wchar u2 = towupper(*s2);
  416. if (u1 != u2)
  417. return u1 < u2 ? -1 : 1;
  418. if (*s1==0 || --n==0)
  419. break;
  420. s1++;
  421. s2++;
  422. }
  423. return 0;
  424. #endif
  425. }
  426. // Case insensitive wcsstr().
  427. const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
  428. {
  429. for (size_t i=0;str[i]!=0;i++)
  430. for (size_t j=0;;j++)
  431. {
  432. if (search[j]==0)
  433. return str+i;
  434. if (tolowerw(str[i+j])!=tolowerw(search[j]))
  435. break;
  436. }
  437. return NULL;
  438. }
  439. #ifndef SFX_MODULE
  440. wchar* wcslower(wchar *s)
  441. {
  442. #ifdef _WIN_ALL
  443. // _wcslwr requires setlocale and we do not want to depend on setlocale
  444. // in Windows. Also CharLower involves less overhead.
  445. CharLower(s);
  446. #else
  447. for (wchar *c=s;*c!=0;c++)
  448. *c=towlower(*c);
  449. #endif
  450. return s;
  451. }
  452. #endif
  453. #ifndef SFX_MODULE
  454. wchar* wcsupper(wchar *s)
  455. {
  456. #ifdef _WIN_ALL
  457. // _wcsupr requires setlocale and we do not want to depend on setlocale
  458. // in Windows. Also CharUpper involves less overhead.
  459. CharUpper(s);
  460. #else
  461. for (wchar *c=s;*c!=0;c++)
  462. *c=towupper(*c);
  463. #endif
  464. return s;
  465. }
  466. #endif
  467. int toupperw(int ch)
  468. {
  469. #if defined(_WIN_ALL)
  470. // CharUpper is more reliable than towupper in Windows, which seems to be
  471. // C locale dependent even in Unicode version. For example, towupper failed
  472. // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
  473. // if value larger than 0xffff is passed to this function.
  474. return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)(ch&0xffff));
  475. #else
  476. return towupper(ch);
  477. #endif
  478. }
  479. int tolowerw(int ch)
  480. {
  481. #if defined(_WIN_ALL)
  482. // CharLower is more reliable than towlower in Windows.
  483. // See comment for towupper above. Use 0xffff mask to prevent crash
  484. // if value larger than 0xffff is passed to this function.
  485. return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)(ch&0xffff));
  486. #else
  487. return towlower(ch);
  488. #endif
  489. }
  490. int atoiw(const wchar *s)
  491. {
  492. return (int)atoilw(s);
  493. }
  494. int64 atoilw(const wchar *s)
  495. {
  496. bool sign=false;
  497. if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
  498. {
  499. s++;
  500. sign=true;
  501. }
  502. // Use unsigned type here, since long string can overflow the variable
  503. // and signed integer overflow is undefined behavior in C++.
  504. uint64 n=0;
  505. while (*s>='0' && *s<='9')
  506. {
  507. n=n*10+(*s-'0');
  508. s++;
  509. }
  510. // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
  511. // when negating 0x8000000000000000.
  512. return sign && int64(n)>=0 ? -int64(n) : int64(n);
  513. }
  514. #ifdef DBCS_SUPPORTED
  515. SupportDBCS gdbcs;
  516. SupportDBCS::SupportDBCS()
  517. {
  518. Init();
  519. }
  520. void SupportDBCS::Init()
  521. {
  522. CPINFO CPInfo;
  523. GetCPInfo(CP_ACP,&CPInfo);
  524. DBCSMode=CPInfo.MaxCharSize > 1;
  525. for (uint I=0;I<ASIZE(IsLeadByte);I++)
  526. IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
  527. }
  528. char* SupportDBCS::charnext(const char *s)
  529. {
  530. // Zero cannot be the trail byte. So if next byte after the lead byte
  531. // is 0, the string is corrupt and we'll better return the pointer to 0,
  532. // to break string processing loops.
  533. return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
  534. }
  535. #endif