接下来,init_reg_sets_1评估在不同类别的寄存器之间移动数据的代价,而后评估在寄存器和内存之间移动数据的代价。
init_reg_sets_1 (continue)
477 /* Initialize the move cost table. Find every subset of each class
478 and take the maximum cost of moving any subset to any other. */
479
480 for (m = 0; m < (unsigned int) MAX_MACHINE_MODE; m++)
481 if (allocatable_regs_of_mode [m])
482 {
483 for (i = 0; i < N_REG_CLASSES; i++)
484 if (contains_reg_of_mode [i][m])
485 for (j = 0; j < N_REG_CLASSES; j++)
486 {
487 int cost;
488 enum reg_class *p1, *p2;
489
490 if (!contains_reg_of_mode [j][m])
491 {
492 move_cost[m][i][j] = 65536;
493 may_move_in_cost[m][i][j] = 65536;
494 may_move_out_cost[m][i][j] = 65536;
495 }
496 else
497 {
498 cost = REGISTER_MOVE_COST (m, i, j);
499
500 for (p2 = &reg_class_subclasses[j][0];
501 *p2 != LIM_REG_CLASSES;
502 p2++)
503 if (*p2 != i && contains_reg_of_mode [*p2][m])
504 cost = MAX (cost, move_cost [m][i][*p2]);
505
506 for (p1 = &reg_class_subclasses[i][0];
507 *p1 != LIM_REG_CLASSES;
508 p1++)
509 if (*p1 != j && contains_reg_of_mode [*p1][m])
510 cost = MAX (cost, move_cost [m][*p1][j]);
511
512 move_cost[m][i][j] = cost;
513
514 if (reg_class_subset_p (i, j))
515 may_move_in_cost[m][i][j] = 0;
516 else
517 may_move_in_cost[m][i][j] = cost;
518
519 if (reg_class_subset_p (j, i))
520 may_move_out_cost[m][i][j] = 0;
521 else
522 may_move_out_cost[m][i][j] = cost;
523 }
524 }
525 else
526 for (j = 0; j < N_REG_CLASSES; j++)
527 {
528 move_cost[m][i][j] = 65536;
529 may_move_in_cost[m][i][j] = 65536;
530 may_move_out_cost[m][i][j] = 65536;
531 }
532 }
533 }
毫无疑问,在寄存器间移动数据,根据目标寄存器的不同,会有不同的代价。有些移动是不可能的,因为目标寄存器不允许,而有些移动需要临时的内存。上面,move_cost用于记录从一个类别的寄存器移动数据到另一个类别寄存器的最大代价。对于允许的移动,宏REGISTER_MOVE_COST评估其代价。
2645 #define REGISTER_MOVE_COST(MODE, CLASS1, CLASS2) \ in i386.h
2646 ix86_register_move_cost ((MODE), (CLASS1), (CLASS2))
14880 int
14881 ix86_register_move_cost (enum machine_mode mode, enum reg_class class1, in i386.c
14882 enum reg_class class2)
14883 {
14884 /* In case we require secondary memory, compute cost of the store followed
14885 by load. In order to avoid bad register allocation choices, we need
14886 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
14887
14888 if (ix86_secondary_memory_needed (class1, class2, mode, 0))
14889 {
14890 int cost = 1;
14891
14892 cost += MAX (MEMORY_MOVE_COST (mode, class1, 0),
14893 MEMORY_MOVE_COST (mode, class1, 1));
14894 cost += MAX (MEMORY_MOVE_COST (mode, class2, 0),
14895 MEMORY_MOVE_COST (mode, class2, 1));
14896
14897 /* In case of copying from general_purpose_register we may emit multiple
14898 stores followed by single load causing memory size mismatch stall.
14899 Count this as arbitrarily high cost of 20. */
14900 if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
14901 cost += 20;
14902
14903 /* In the case of FP/MMX moves, the registers actually overlap, and we
14904 have to switch modes in order to treat them differently. */
14905 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
14906 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
14907 cost += 20;
14908
14909 return cost;
14910 }
14911
14912 /* Moves between SSE/MMX and integer unit are expensive. */
14913 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
14914 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
14915 return ix86_cost->mmxsse_to_integer;
14916 if (MAYBE_FLOAT_CLASS_P (class1))
14917 return ix86_cost->fp_move;
14918 if (MAYBE_SSE_CLASS_P (class1))
14919 return ix86_cost->sse_move;
14920 if (MAYBE_MMX_CLASS_P (class1))
14921 return ix86_cost->mmx_move;
14922 return 2;
14923 }
上面,在14888行,ix86_secondary_memory_needed检查在这2个类别的寄存器之间移动数据是否需要临时内存。
14852 int in i386.c
14853 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
14854 enum machine_mode mode, int strict)
14855 {
14856 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
14857 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
14858 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
14859 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
14860 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
14861 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
14862 {
14863 if (strict)
14864 abort ();
14865 else
14866 return 1;
14867 }
14868 return (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)
14869 || ((SSE_CLASS_P (class1) != SSE_CLASS_P (class2)
14870 || MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
14871 && ((mode != SImode && (mode != DImode || !TARGET_64BIT))
14872 || (!TARGET_INTER_UNIT_MOVES && !optimize_size))));
14873 }
注意到函数最后的参数在这里是0。而且这是该函数唯一被调用的地方。宏MAYBE_FLOAT_CLASS_P检查寄存器的类别是否与FLOAT_REG类别有重合。而宏FLOAT_CLASS_P检查寄存器的类别是否被FLOAT_REG类别所包含。其他宏也是类似的。FLOAT_CLASS_P及MAYBE_FLOAT_CLASS_P的定义如下:
1318 #define MAYBE_FLOAT_CLASS_P(CLASS) \ in i386.h
1319 reg_classes_intersect_p ((CLASS), FLOAT_REGS)
1310 #define FLOAT_CLASS_P(CLASS) \
1311 reg_class_subset_p ((CLASS), FLOAT_REGS)
2545 int
2546 reg_classes_intersect_p (enum reg_class c1, enum reg_class c2) in regclass.c
2547 {
2548 HARD_REG_SET c;
2549
2550 if (c1 == c2) return 1;
2551
2552 if (c1 == ALL_REGS || c2 == ALL_REGS)
2553 return 1;
2554
2555 COPY_HARD_REG_SET (c, reg_class_contents[(int) c1]);
2556 AND_HARD_REG_SET (c, reg_class_contents[(int) c2]);
2557
2558 GO_IF_HARD_REG_SUBSET (c, reg_class_contents[(int) NO_REGS], lose);
2559 return 1;
2560
2561 lose:
2562 return 0;
2563 }
2529 int
2530 reg_class_subset_p (enum reg_class c1, enum reg_class c2) in regclass.c
2531 {
2532 if (c1 == c2) return 1;
2533
2534 if (c2 == ALL_REGS)
2535 win:
2536 return 1;
2537 GO_IF_HARD_REG_SUBSET (reg_class_contents[(int) c1],
2538 reg_class_contents[(int) c2],
2539 win);
2540 return 0;
2541 }
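那么在ix86_register_move_cost的14888行,如果class1或class2与FLOAT_REGS、SSE_REGS或MMX_REGS有重合却又不被其完全包含(14856~14861行,此时因为strict为0而返回1);或者class1和class2不属于同一种寄存器——一个是浮点类别而另一个不是,或者一个是SSE/MMX类别而另一个不是、且数据的模式或目标机器的选项不允许直接移动(14868~14872行),ix86_secondary_memory_needed将返回true,即在这些寄存器间移动数据需要临时内存。系统首先需要把寄存器1的内容放到内存,然后从内存读入到寄存器2中,因为在这2个寄存器中对这个值的编码是不同的,直接的拷贝是不合适的。因此需要2次计入代价,一次从寄存器1中移出,另一次移入寄存器2。谨慎起见,对每个类别都取其读内存与写内存两种代价中的较大者。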
2656 #define MEMORY_MOVE_COST(MODE, CLASS, IN) \ in i386.h
2657 ix86_memory_move_cost ((MODE), (CLASS), (IN))
14984 int in i386.c
14985 ix86_memory_move_cost (enum machine_mode mode, enum reg_class class, int in)
14986 {
14987 if (FLOAT_CLASS_P (class))
14988 {
14989 int index;
14990 switch (mode)
14991 {
14992 case SFmode:
14993 index = 0;
14994 break;
14995 case DFmode:
14996 index = 1;
14997 break;
14998 case XFmode:
14999 index = 2;
15000 break;
15001 default:
15002 return 100;
15003 }
15004 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
15005 }
15006 if (SSE_CLASS_P (class))
15007 {
15008 int index;
15009 switch (GET_MODE_SIZE (mode))
15010 {
15011 case 4:
15012 index = 0;
15013 break;
15014 case 8:
15015 index = 1;
15016 break;
15017 case 16:
15018 index = 2;
15019 break;
15020 default:
15021 return 100;
15022 }
15023 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
15024 }
15025 if (MMX_CLASS_P (class))
15026 {
15027 int index;
15028 switch (GET_MODE_SIZE (mode))
15029 {
15030 case 4:
15031 index = 0;
15032 break;
15033 case 8:
15034 index = 1;
15035 break;
15036 default:
15037 return 100;
15038 }
15039 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
15040 }
15041 switch (GET_MODE_SIZE (mode))
15042 {
15043 case 1:
15044 if (in)
15045 return (Q_CLASS_P (class) ? ix86_cost->int_load[0]
15046 : ix86_cost->movzbl_load);
15047 else
15048 return (Q_CLASS_P (class) ? ix86_cost->int_store[0]
15049 : ix86_cost->int_store[0] + 4);
15050 break;
15051 case 2:
15052 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
15053 default:
15054 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
15055 if (mode == TFmode)
15056 mode = XFmode;
15057 return ((in ? ix86_cost->int_load[2] : ix86_cost->int_store[2])
15058 * (((int) GET_MODE_SIZE (mode)
15059 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
15060 }
15061 }
这里关键的数据结构是ix86_cost,它记录了特定处理器相关的代价数据。对于x86机器,有如下的定义:
416 static const in i386.c
417 struct processor_costs pentium4_cost = {
418 1, /* cost of an add instruction */
419 1, /* cost of a lea instruction */
420 4, /* variable shift costs */
421 4, /* constant shift costs */
422 {15, 15, 15, 15, 15}, /* cost of starting a multiply */
423 0, /* cost of multiply per each bit set */
424 {56, 56, 56, 56, 56}, /* cost of a divide/mod */
425 1, /* cost of movsx */
426 1, /* cost of movzx */
427 16, /* "large" insn */
428 6, /* MOVE_RATIO */
429 2, /* cost for loading QImode using movzbl */
430 {4, 5, 4}, /* cost of loading integer registers
431 in QImode, HImode and SImode.
432 Relative to reg-reg move (2). */
433 {2, 3, 2}, /* cost of storing integer registers */
434 2, /* cost of reg,reg fld/fst */
435 {2, 2, 6}, /* cost of loading fp registers
436 in SFmode, DFmode and XFmode */
437 {4, 4, 6}, /* cost of loading integer registers */
438 2, /* cost of moving MMX register */
439 {2, 2}, /* cost of loading MMX registers
440 in SImode and DImode */
441 {2, 2}, /* cost of storing MMX registers
442 in SImode and DImode */
443 12, /* cost of moving SSE register */
444 {12, 12, 12}, /* cost of loading SSE registers
445 in SImode, DImode and TImode */
446 {2, 2, 8}, /* cost of storing SSE registers
447 in SImode, DImode and TImode */
448 10, /* MMX or SSE register to integer */
449 64, /* size of prefetch block */
450 6, /* number of parallel prefetches */
451 2, /* Branch cost */
452 5, /* cost of FADD and FSUB insns. */
453 7, /* cost of FMUL instruction. */
454 43, /* cost of FDIV instruction. */
455 2, /* cost of FABS instruction. */
456 2, /* cost of FCHS instruction. */
457 43, /* cost of FSQRT instruction. */
458 };
459
460 const struct processor_costs *ix86_cost = &pentium_cost;
而processor_costs的定义为:
39 struct processor_costs { in i386.h
40 const int add; /* cost of an add instruction */
41 const int lea; /* cost of a lea instruction */
42 const int shift_var; /* variable shift costs */
43 const int shift_const; /* constant shift costs */
44 const int mult_init[5]; /* cost of starting a multiply
45 in QImode, HImode, SImode, DImode, TImode*/
46 const int mult_bit; /* cost of multiply per each bit set */
47 const int divide[5]; /* cost of a divide/mod
48 in QImode, HImode, SImode, DImode, TImode*/
49 int movsx; /* The cost of movsx operation. */
50 int movzx; /* The cost of movzx operation. */
51 const int large_insn; /* insns larger than this cost more */
52 const int move_ratio; /* The threshold of number of scalar
53 memory-to-memory move insns. */
54 const int movzbl_load; /* cost of loading using movzbl */
55 const int int_load[3]; /* cost of loading integer registers
56 in QImode, HImode and SImode relative
57 to reg-reg move (2). */
58 const int int_store[3]; /* cost of storing integer register
59 in QImode, HImode and SImode */
60 const int fp_move; /* cost of reg,reg fld/fst */
61 const int fp_load[3]; /* cost of loading FP register
62 in SFmode, DFmode and XFmode */
63 const int fp_store[3]; /* cost of storing FP register
64 in SFmode, DFmode and XFmode */
65 const int mmx_move; /* cost of moving MMX register. */
66 const int mmx_load[2]; /* cost of loading MMX register
67 in SImode and DImode */
68 const int mmx_store[2]; /* cost of storing MMX register
69 in SImode and DImode */
70 const int sse_move; /* cost of moving SSE register. */
71 const int sse_load[3]; /* cost of loading SSE register
72 in SImode, DImode and TImode*/
73 const int sse_store[3]; /* cost of storing SSE register
74 in SImode, DImode and TImode*/
75 const int mmxsse_to_integer; /* cost of moving mmxsse register to
76 integer and vice versa. */
77 const int prefetch_block; /* bytes moved to cache for prefetch. */
78 const int simultaneous_prefetches; /* number of parallel prefetch
79 operations. */
80 const int branch_cost; /* Default value for BRANCH_COST. */
81 const int fadd; /* cost of FADD and FSUB instructions. */
82 const int fmul; /* cost of FMUL instruction. */
83 const int fdiv; /* cost of FDIV instruction. */
84 const int fabs; /* cost of FABS instruction. */
85 const int fchs; /* cost of FCHS instruction. */
86 const int fsqrt; /* cost of FSQRT instruction. */
87 };
从这个定义,可以知道大多数数据需要预先定义,预先提供。要做一个编译器,需要很好地了解CPU。
在ix86_register_move_cost的14892和14894行,把2个方向上代价较高者作为所寻求的代价并存入move_cost。
在14900行,宏CLASS_MAX_NREGS找出类别为CLASS的寄存器,用于模式为MODE的数据时,所需要的最大连续寄存器的数目。从14913行开始,对于不需要临时内存的情况,代价数据可以从ix86_cost得到。在具有整数类别的寄存器间移动数据的代价最小——为2。注意到如果在14888行,ix86_secondary_memory_needed返回false,而又满足14913行的条件,那么这次移动必定是在MMX或SSE寄存器与整数寄存器之间移动整数(SImode,或者64位目标上的DImode):只有在这种情况下,ix86_secondary_memory_needed中14868~14872行的条件才不成立。
在通过REGISTER_MOVE_COST找出寄存器类别间的数据移动的代价后,把涉及类别中,具有最大代价的子类别的代价数据作为该类别的代价。