glue_x86.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. #ifndef _NSEEL_GLUE_X86_H_
  2. #define _NSEEL_GLUE_X86_H_
#include <intrin.h>
#include <string.h>
  4. #define GLUE_MAX_FPSTACK_SIZE 8
  5. // endOfInstruction is end of jump with relative offset, offset is offset from end of instruction to jump to
  6. #define GLUE_JMP_SET_OFFSET(endOfInstruction,offset) (((int *)(endOfInstruction))[-1] = (offset))
  7. static const unsigned char GLUE_JMP_NC[] = { 0xE9, 0,0,0,0, }; // jmp<offset>
  8. static const unsigned char GLUE_JMP_IF_P1_Z[] = {0x85, 0xC0, 0x0F, 0x84, 0,0,0,0 }; // test eax, eax, jz
  9. static const unsigned char GLUE_JMP_IF_P1_NZ[] = {0x85, 0xC0, 0x0F, 0x85, 0,0,0,0 }; // test eax, eax, jnz
  10. #define GLUE_FUNC_ENTER_SIZE 0
  11. #define GLUE_FUNC_LEAVE_SIZE 0
  12. const static unsigned int GLUE_FUNC_ENTER[1];
  13. const static unsigned int GLUE_FUNC_LEAVE[1];
  14. // x86
  15. // stack is 16 byte aligned
  16. // when pushing values to stack, alignment pushed first, then value (value is at the lower address)
  17. // when pushing pointers to stack, alignment pushed first, then pointer (pointer is at the lower address)
  18. static const unsigned char GLUE_PUSH_P1PTR_AS_VALUE[] =
  19. {
  20. 0x83, 0xEC, 8, /* sub esp, 8 */
  21. 0xff, 0x70, 0x4, /* push dword [eax+4] */
  22. 0xff, 0x30, /* push dword [eax] */
  23. };
  24. static int GLUE_POP_VALUE_TO_ADDR(unsigned char *buf, void *destptr)
  25. {
  26. if (buf)
  27. {
  28. *buf++ = 0xB8; *(void **) buf = destptr; buf+=4; // mov eax, directvalue
  29. *buf++ = 0x8f; *buf++ = 0x00; // pop dword [eax]
  30. *buf++ = 0x8f; *buf++ = 0x40; *buf++ = 4; // pop dword [eax+4]
  31. *buf++ = 0x59; // pop ecx (alignment)
  32. *buf++ = 0x59; // pop ecx (alignment)
  33. }
  34. return 12;
  35. }
  36. static int GLUE_COPY_VALUE_AT_P1_TO_PTR(unsigned char *buf, void *destptr)
  37. {
  38. if (buf)
  39. {
  40. *buf++ = 0x8B; *buf++ = 0x38; // mov edi, [eax]
  41. *buf++ = 0x8B; *buf++ = 0x48; *buf++ = 0x04; // mov ecx, [eax+4]
  42. *buf++ = 0xB8; *(void **) buf = destptr; buf+=4; // mov eax, directvalue
  43. *buf++ = 0x89; *buf++ = 0x38; // mov [eax], edi
  44. *buf++ = 0x89; *buf++ = 0x48; *buf++ = 0x04; // mov [eax+4], ecx
  45. }
  46. return 2 + 3 + 5 + 2 + 3;
  47. }
  48. static int GLUE_POP_FPSTACK_TO_PTR(unsigned char *buf, void *destptr)
  49. {
  50. if (buf)
  51. {
  52. *buf++ = 0xB8; *(void **) buf = destptr; buf+=4; // mov eax, directvalue
  53. *buf++ = 0xDD; *buf++ = 0x18; // fstp qword [eax]
  54. }
  55. return 1+4+2;
  56. }
  57. #define GLUE_MOV_PX_DIRECTVALUE_SIZE 5
  58. #define GLUE_MOV_PX_DIRECTVALUE_TOSTACK_SIZE 6 // length when wv == -1
  59. static void GLUE_MOV_PX_DIRECTVALUE_GEN(void *b, INT_PTR v, int wv)
  60. {
  61. if (wv==-1)
  62. {
  63. const static unsigned char t[2] = {0xDD, 0x05};
  64. memcpy(b,t,2);
  65. b= ((unsigned char *)b)+2;
  66. }
  67. else
  68. {
  69. const static unsigned char tab[3] = {
  70. 0xB8 /* mov eax, dv*/,
  71. 0xBF /* mov edi, dv */ ,
  72. 0xB9 /* mov ecx, dv */
  73. };
  74. *((unsigned char *)b) = tab[wv]; // mov eax, dv
  75. b= ((unsigned char *)b)+1;
  76. }
  77. *(INT_PTR *)b = v;
  78. }
  79. const static unsigned char GLUE_PUSH_P1[4]={0x83, 0xEC, 12, 0x50}; // sub esp, 12, push eax
  80. #define GLUE_STORE_P1_TO_STACK_AT_OFFS_SIZE(x) 7
  81. static void GLUE_STORE_P1_TO_STACK_AT_OFFS(void *b, int offs)
  82. {
  83. ((unsigned char *)b)[0] = 0x89; // mov [esp+offs], eax
  84. ((unsigned char *)b)[1] = 0x84;
  85. ((unsigned char *)b)[2] = 0x24;
  86. *(int *)((unsigned char *)b+3) = offs;
  87. }
  88. #define GLUE_MOVE_PX_STACKPTR_SIZE 2
  89. static void GLUE_MOVE_PX_STACKPTR_GEN(void *b, int wv)
  90. {
  91. static const unsigned char tab[3][GLUE_MOVE_PX_STACKPTR_SIZE]=
  92. {
  93. { 0x89, 0xe0 }, // mov eax, esp
  94. { 0x89, 0xe7 }, // mov edi, esp
  95. { 0x89, 0xe1 }, // mov ecx, esp
  96. };
  97. memcpy(b,tab[wv],GLUE_MOVE_PX_STACKPTR_SIZE);
  98. }
  99. #define GLUE_MOVE_STACK_SIZE 6
  100. static void GLUE_MOVE_STACK(void *b, int amt)
  101. {
  102. ((unsigned char *)b)[0] = 0x81;
  103. if (amt <0)
  104. {
  105. ((unsigned char *)b)[1] = 0xEC;
  106. *(int *)((char*)b+2) = -amt; // sub esp, -amt
  107. }
  108. else
  109. {
  110. ((unsigned char *)b)[1] = 0xc4;
  111. *(int *)((char*)b+2) = amt; // add esp, amt
  112. }
  113. }
  114. #define GLUE_POP_PX_SIZE 4
  115. static void GLUE_POP_PX(void *b, int wv)
  116. {
  117. static const unsigned char tab[3][GLUE_POP_PX_SIZE]=
  118. {
  119. {0x58,/*pop eax*/ 0x83, 0xC4, 12 /* add esp, 12*/},
  120. {0x5F,/*pop edi*/ 0x83, 0xC4, 12},
  121. {0x59,/*pop ecx*/ 0x83, 0xC4, 12},
  122. };
  123. memcpy(b,tab[wv],GLUE_POP_PX_SIZE);
  124. }
  125. #define GLUE_SET_PX_FROM_P1_SIZE 2
  126. static void GLUE_SET_PX_FROM_P1(void *b, int wv)
  127. {
  128. static const unsigned char tab[3][GLUE_SET_PX_FROM_P1_SIZE]={
  129. {0x90,0x90}, // should never be used! (nopnop)
  130. {0x89,0xC7}, // mov edi, eax
  131. {0x89,0xC1}, // mov ecx, eax
  132. };
  133. memcpy(b,tab[wv],GLUE_SET_PX_FROM_P1_SIZE);
  134. }
  135. #define GLUE_POP_FPSTACK_SIZE 2
  136. static const unsigned char GLUE_POP_FPSTACK[2] = { 0xDD, 0xD8 }; // fstp st0
  137. static const unsigned char GLUE_POP_FPSTACK_TOSTACK[] = {
  138. 0x83, 0xEC, 16, // sub esp, 16
  139. 0xDD, 0x1C, 0x24 // fstp qword (%esp)
  140. };
  141. static const unsigned char GLUE_POP_STACK_TO_FPSTACK[] = {
  142. 0xDD, 0x04, 0x24, // fld qword (%esp)
  143. 0x83, 0xC4, 16 // add esp, 16
  144. };
  145. static const unsigned char GLUE_POP_FPSTACK_TO_WTP[] = {
  146. 0xDD, 0x1E, /* fstp qword [esi] */
  147. 0x83, 0xC6, 8, /* add esi, 8 */
  148. };
  149. #define GLUE_SET_PX_FROM_WTP_SIZE 2
  150. static void GLUE_SET_PX_FROM_WTP(void *b, int wv)
  151. {
  152. static const unsigned char tab[3][GLUE_SET_PX_FROM_WTP_SIZE]={
  153. {0x89,0xF0}, // mov eax, esi
  154. {0x89,0xF7}, // mov edi, esi
  155. {0x89,0xF1}, // mov ecx, esi
  156. };
  157. memcpy(b,tab[wv],GLUE_SET_PX_FROM_WTP_SIZE);
  158. }
  159. #define GLUE_PUSH_VAL_AT_PX_TO_FPSTACK_SIZE 2
  160. static void GLUE_PUSH_VAL_AT_PX_TO_FPSTACK(void *b, int wv)
  161. {
  162. static const unsigned char tab[3][GLUE_PUSH_VAL_AT_PX_TO_FPSTACK_SIZE]={
  163. {0xDD,0x00}, // fld qword [eax]
  164. {0xDD,0x07}, // fld qword [edi]
  165. {0xDD,0x01}, // fld qword [ecx]
  166. };
  167. memcpy(b,tab[wv],GLUE_PUSH_VAL_AT_PX_TO_FPSTACK_SIZE);
  168. }
  169. #define GLUE_POP_FPSTACK_TO_WTP_TO_PX_SIZE (GLUE_SET_PX_FROM_WTP_SIZE + sizeof(GLUE_POP_FPSTACK_TO_WTP))
  170. static void GLUE_POP_FPSTACK_TO_WTP_TO_PX(unsigned char *buf, int wv)
  171. {
  172. GLUE_SET_PX_FROM_WTP(buf,wv);
  173. memcpy(buf + GLUE_SET_PX_FROM_WTP_SIZE,GLUE_POP_FPSTACK_TO_WTP,sizeof(GLUE_POP_FPSTACK_TO_WTP));
  174. };
  175. const static unsigned char GLUE_RET=0xC3;
  176. static int GLUE_RESET_WTP(unsigned char *out, void *ptr)
  177. {
  178. if (out)
  179. {
  180. *out++ = 0xBE; // mov esi, constant
  181. memcpy(out,&ptr,sizeof(void *));
  182. out+=sizeof(void *);
  183. }
  184. return 1+sizeof(void *);
  185. }
  186. #ifdef _MSC_VER
  187. #pragma warning(push)
  188. #pragma warning(disable: 4731)
  189. #endif
  190. #define GLUE_TABPTR_IGNORED
  191. #define GLUE_CALL_CODE(bp, cp, rt) do { \
  192. if (h->compile_flags&NSEEL_CODE_COMPILE_FLAG_NOFPSTATE) eel_callcode32_fast(cp, rt); \
  193. else eel_callcode32(cp, rt);\
  194. } while(0)
  // Calls the generated code at `cp`, with `ramptr` placed in ebx for the callee.
  //
  // Unless NSEEL_EEL1_COMPAT_MODE is defined, the x87 control word is saved in
  // `oldsw`, OR'ed with 0xE3F into `newsw`, loaded for the duration of the call
  // and restored from `oldsw` afterwards.
  //
  // Both the MSVC and the GCC variants remember esp in ebp, round esp down to a
  // multiple of 16, subtract another 12, make the indirect call, and then
  // restore esp from ebp.
  //
  // NOTE(review): in the GCC variant `oldsw` and `newsw` are written by the asm
  // but are passed as "m" input operands, and no "memory" clobber is listed --
  // confirm that this is intended.
  // NOTE(review): edx and ecx are input operands in the GCC variant; presumably
  // the called code may change them -- verify against the code generator.
  195. static void eel_callcode32(INT_PTR cp, INT_PTR ramptr)
  196. {
  197. #ifndef NSEEL_EEL1_COMPAT_MODE
  198. short oldsw, newsw;
  199. #endif
  200. #ifdef _MSC_VER
  201. __asm
  202. {
  203. #ifndef NSEEL_EEL1_COMPAT_MODE
  204. fnstcw [oldsw]
  205. mov ax, [oldsw]
  206. or ax, 0xE3F // 53 or 64 bit precision (depending on whether 0x100 is set), trunc, and masking all exceptions
  207. mov [newsw], ax
  208. fldcw [newsw]
  209. #endif
  // eax = code to call, ebx = ramptr; pushad/popad then bracket the call
  210. mov eax, cp
  211. mov ebx, ramptr
  212. pushad
  213. mov ebp, esp
  214. and esp, -16
  215. // on win32, which _MSC_VER implies, we keep things aligned to 16 bytes, and if we call a win32 function,
  216. // the stack is 16 byte aligned before the call, meaning that if calling a function with no frame pointer,
  217. // the stack would be aligned to a 16 byte boundary +4, which isn't good for performance. Having said that,
  218. // normally we compile with frame pointers (which brings that to 16 byte + 8, which is fine), or ICC, which
  219. // for nontrivial functions will align the stack itself (for very short functions, it appears to weigh the
  220. // cost of aligning the stack vs that of the slower misaligned double accesses).
  221. // it may be worthwhile (at some point) to put some logic in the code that calls out to functions
  222. // (generic1parm etc) to detect which alignment would be most optimal.
  223. sub esp, 12
  224. call eax
  225. mov esp, ebp
  226. popad
  227. #ifndef NSEEL_EEL1_COMPAT_MODE
  228. fldcw [oldsw]
  229. #endif
  230. };
  231. #else // gcc x86
  232. __asm__(
  233. #ifndef NSEEL_EEL1_COMPAT_MODE
  234. "fnstcw %2\n"
  235. "movw %2, %%ax\n"
  236. "orw $0xE3F, %%ax\n" // 53 or 64 bit precision (depending on whether 0x100 is set), trunc, and masking all exceptions
  237. "movw %%ax, %3\n"
  238. "fldcw %3\n"
  239. #endif
  // ebx and ebp are saved by hand; ebx then receives ramptr (passed in ecx)
  240. "pushl %%ebx\n"
  241. "movl %%ecx, %%ebx\n"
  242. "pushl %%ebp\n"
  243. "movl %%esp, %%ebp\n"
  244. "andl $-16, %%esp\n" // align stack to 16 bytes
  245. "subl $12, %%esp\n" // call will push 4 bytes on stack, align for that
  // cp is passed in edx
  246. "call *%%edx\n"
  // leave restores esp from ebp and pops the saved ebp
  247. "leave\n"
  248. "popl %%ebx\n"
  249. #ifndef NSEEL_EEL1_COMPAT_MODE
  250. "fldcw %2\n"
  251. #endif
  252. ::
  253. "d" (cp), "c" (ramptr)
  254. #ifndef NSEEL_EEL1_COMPAT_MODE
  255. , "m" (oldsw), "m" (newsw)
  256. #endif
  257. : "%eax","%esi","%edi");
  258. #endif //gcc x86
  259. }
  // Switches the x87 control word to the setting used for generated code and
  // remembers the previous one.
  //
  // The current control word is stored at the address of s[0]; that value OR'ed
  // with 0xE3F is stored 4 bytes further on (s[1] with a 4-byte int) and loaded
  // as the new control word. Only 16 bits of each element are written.
  //
  // NOTE(review): this is a non-static function defined in a header -- confirm
  // the header is only included from one translation unit.
  // NOTE(review): the GCC variant writes through `s` without a "memory" clobber
  // or output operand -- confirm that this is intended.
  260. void eel_enterfp(int s[2])
  261. {
  262. #ifdef _MSC_VER
  263. __asm
  264. {
  265. mov ecx, s
  266. fnstcw [ecx]
  267. mov ax, [ecx]
  268. or ax, 0xE3F // 53 or 64 bit precision (depending on whether 0x100 is set), trunc, and masking all exceptions
  269. mov [ecx+4], ax
  270. fldcw [ecx+4]
  271. };
  272. #else
  273. __asm__(
  274. "fnstcw (%%ecx)\n"
  275. "movw (%%ecx), %%ax\n"
  276. "orw $0xE3F, %%ax\n" // 53 or 64 bit precision (depending on whether 0x100 is set), trunc, and masking all exceptions
  277. "movw %%ax, 4(%%ecx)\n"
  278. "fldcw 4(%%ecx)\n"
  279. :: "c" (s) : "%eax");
  280. #endif
  281. }
  // Loads the x87 control word from the 16 bits stored at the address of s[0].
  // Presumably `s` is the array previously filled in by eel_enterfp -- verify
  // against callers.
  //
  // NOTE(review): this is a non-static function defined in a header -- confirm
  // the header is only included from one translation unit.
  // NOTE(review): the GCC variant lists %eax as clobbered although the asm text
  // does not use it, and reads through `s` without a memory operand or clobber.
  282. void eel_leavefp(int s[2])
  283. {
  284. #ifdef _MSC_VER
  285. __asm
  286. {
  287. mov ecx, s
  288. fldcw [ecx]
  289. };
  290. #else
  291. __asm__(
  292. "fldcw (%%ecx)\n"
  293. :: "c" (s) : "%eax");
  294. #endif
  295. }
  // Calls the generated code at `cp`, with `ramptr` placed in ebx for the callee.
  // Unlike eel_callcode32, this variant never touches the x87 control word.
  //
  // Both the MSVC and the GCC variants remember esp in ebp, round esp down to a
  // multiple of 16, subtract another 12, make the indirect call, and then
  // restore esp from ebp.
  //
  // NOTE(review): edx and ecx are input operands in the GCC variant; presumably
  // the called code may change them -- verify against the code generator.
  296. static void eel_callcode32_fast(INT_PTR cp, INT_PTR ramptr)
  297. {
  298. #ifdef _MSC_VER
  299. __asm
  300. {
  // eax = code to call, ebx = ramptr; pushad/popad then bracket the call
  301. mov eax, cp
  302. mov ebx, ramptr
  303. pushad
  304. mov ebp, esp
  305. and esp, -16
  306. // on win32, which _MSC_VER implies, we keep things aligned to 16 bytes, and if we call a win32 function,
  307. // the stack is 16 byte aligned before the call, meaning that if calling a function with no frame pointer,
  308. // the stack would be aligned to a 16 byte boundary +4, which isn't good for performance. Having said that,
  309. // normally we compile with frame pointers (which brings that to 16 byte + 8, which is fine), or ICC, which
  310. // for nontrivial functions will align the stack itself (for very short functions, it appears to weigh the
  311. // cost of aligning the stack vs that of the slower misaligned double accesses).
  312. // it may be worthwhile (at some point) to put some logic in the code that calls out to functions
  313. // (generic1parm etc) to detect which alignment would be most optimal.
  314. sub esp, 12
  315. call eax
  316. mov esp, ebp
  317. popad
  318. };
  319. #else // gcc x86
  320. __asm__(
  // ebx and ebp are saved by hand; ebx then receives ramptr (passed in ecx)
  321. "pushl %%ebx\n"
  322. "movl %%ecx, %%ebx\n"
  323. "pushl %%ebp\n"
  324. "movl %%esp, %%ebp\n"
  325. "andl $-16, %%esp\n" // align stack to 16 bytes
  326. "subl $12, %%esp\n" // call will push 4 bytes on stack, align for that
  // cp is passed in edx
  327. "call *%%edx\n"
  // leave restores esp from ebp and pops the saved ebp
  328. "leave\n"
  329. "popl %%ebx\n"
  330. ::
  331. "d" (cp), "c" (ramptr)
  332. : "%eax","%esi","%edi");
  333. #endif //gcc x86
  334. }
  335. #ifdef _MSC_VER
  336. #pragma warning(pop)
  337. #endif
  338. static unsigned char *EEL_GLUE_set_immediate(void *_p, INT_PTR newv)
  339. {
  340. char *p=(char*)_p;
  341. INT_PTR scan = 0xFEFEFEFE;
  342. while (*(INT_PTR *)p != scan) p++;
  343. *(INT_PTR *)p = newv;
  344. return (unsigned char *) (((INT_PTR*)p)+1);
  345. }
  346. #define INT_TO_LECHARS(x) ((x)&0xff),(((x)>>8)&0xff), (((x)>>16)&0xff), (((x)>>24)&0xff)
  347. #define GLUE_INLINE_LOOPS
  348. static const unsigned char GLUE_LOOP_LOADCNT[]={
  349. 0xDB, 0x1E, //fistp dword [esi]
  350. 0x8B, 0x0E, // mov ecx, [esi]
  351. 0x81, 0xf9, 1,0,0,0, // cmp ecx, 1
  352. 0x0F, 0x8C, 0,0,0,0, // JL <skipptr>
  353. };
  354. #if NSEEL_LOOPFUNC_SUPPORT_MAXLEN > 0
  355. #define GLUE_LOOP_CLAMPCNT_SIZE sizeof(GLUE_LOOP_CLAMPCNT)
  356. static const unsigned char GLUE_LOOP_CLAMPCNT[]={
  357. 0x81, 0xf9, INT_TO_LECHARS(NSEEL_LOOPFUNC_SUPPORT_MAXLEN), // cmp ecx, NSEEL_LOOPFUNC_SUPPORT_MAXLEN
  358. 0x0F, 0x8C, 5,0,0,0, // JL over-the-mov
  359. 0xB9, INT_TO_LECHARS(NSEEL_LOOPFUNC_SUPPORT_MAXLEN), // mov ecx, NSEEL_LOOPFUNC_SUPPORT_MAXLEN
  360. };
  361. #else
  362. #define GLUE_LOOP_CLAMPCNT_SIZE 0
  363. #define GLUE_LOOP_CLAMPCNT ""
  364. #endif
  365. #define GLUE_LOOP_BEGIN_SIZE sizeof(GLUE_LOOP_BEGIN)
  366. static const unsigned char GLUE_LOOP_BEGIN[]={
  367. 0x56, //push esi
  368. 0x51, // push ecx
  369. 0x81, 0xEC, 0x08, 0,0,0, // sub esp, 8
  370. };
  371. static const unsigned char GLUE_LOOP_END[]={
  372. 0x81, 0xC4, 0x08, 0,0,0, // add esp, 8
  373. 0x59, //pop ecx
  374. 0x5E, // pop esi
  375. 0x49, // dec ecx
  376. 0x0f, 0x85, 0,0,0,0, // jnz ...
  377. };
  378. #if NSEEL_LOOPFUNC_SUPPORT_MAXLEN > 0
  379. #define GLUE_WHILE_SETUP_SIZE sizeof(GLUE_WHILE_SETUP)
  380. static const unsigned char GLUE_WHILE_SETUP[]={
  381. 0xB9, INT_TO_LECHARS(NSEEL_LOOPFUNC_SUPPORT_MAXLEN), // mov ecx, NSEEL_LOOPFUNC_SUPPORT_MAXLEN
  382. };
  383. static const unsigned char GLUE_WHILE_BEGIN[]={
  384. 0x56, //push esi
  385. 0x51, // push ecx
  386. 0x81, 0xEC, 0x08, 0,0,0, // sub esp, 8
  387. };
  388. static const unsigned char GLUE_WHILE_END[]={
  389. 0x81, 0xC4, 0x08, 0,0,0, // add esp, 8
  390. 0x59, //pop ecx
  391. 0x5E, // pop esi
  392. 0x49, // dec ecx
  393. 0x0f, 0x84, 0,0,0,0, // jz endpt
  394. };
  395. #else
  396. #define GLUE_WHILE_SETUP_SIZE 0
  397. #define GLUE_WHILE_SETUP ""
  398. #define GLUE_WHILE_END_NOJUMP
  399. static const unsigned char GLUE_WHILE_BEGIN[]={
  400. 0x56, //push esi
  401. 0x81, 0xEC, 12, 0,0,0, // sub esp, 12
  402. };
  403. static const unsigned char GLUE_WHILE_END[]={
  404. 0x81, 0xC4, 12, 0,0,0, // add esp, 12
  405. 0x5E, // pop esi
  406. };
  407. #endif
  408. static const unsigned char GLUE_WHILE_CHECK_RV[] = {
  409. 0x85, 0xC0, // test eax, eax
  410. 0x0F, 0x85, 0,0,0,0 // jnz looppt
  411. };
  412. static const unsigned char GLUE_SET_P1_Z[] = { 0x29, 0xC0 }; // sub eax, eax
  413. static const unsigned char GLUE_SET_P1_NZ[] = { 0xb0, 0x01 }; // mov al, 1
  414. #define GLUE_HAS_FXCH
  415. static const unsigned char GLUE_FXCH[] = {0xd9, 0xc9};
  416. #define GLUE_HAS_FLDZ
  417. static const unsigned char GLUE_FLDZ[] = {0xd9, 0xee};
  418. #define GLUE_HAS_FLD1
  419. static const unsigned char GLUE_FLD1[] = {0xd9, 0xe8};
  420. static EEL_F negativezeropointfive=-0.5f;
  421. static EEL_F onepointfive=1.5f;
  422. #define GLUE_INVSQRT_NEEDREPL &negativezeropointfive, &onepointfive,
  423. #define GLUE_HAS_NATIVE_TRIGSQRTLOG
  424. static void *GLUE_realAddress(void *fn, void *fn_e, int *size)
  425. {
  426. static const unsigned char sig[12] = { 0x89, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90 };
  427. unsigned char *p = (unsigned char *)fn;
  428. #if defined(_DEBUG) && defined(_MSC_VER)
  429. if (*p == 0xE9) // this means jump to the following address (debug stub)
  430. {
  431. p += 5 + *(int *)(p+1);
  432. }
  433. #endif
  434. while (memcmp(p,sig,sizeof(sig))) p++;
  435. p+=sizeof(sig);
  436. fn = p;
  437. while (memcmp(p,sig,sizeof(sig))) p++;
  438. *size = p - (unsigned char *)fn;
  439. return fn;
  440. }
  441. #endif