经过对最新的 GotoBLAS2 内核的分析,dgemm 可分解为四个部分:dgemm_beta、dgemm_i*copy、dgemm_kernel、dgemm_o*copy。
目标:以 Intel Core2 Duo CPU E8500 为平台,用 MASM 语法重新构建最小的 dgemm 模块。
1.dgemm_beta完成:C=BETA*C的运算。
先重构 dgemm_beta,再给出一个简单的 C 调用实例:
;/*********************************************************************/
;/* Copyright 2009, 2010 The University of Texas at Austin.           */
;/* All rights reserved.                                              */
;/* dgemm_beta from the GotoBLAS2 dgemm kernel. By G-Spider @2011.    */
;/* ml.exe compiler.                                                  */
;/*********************************************************************/

; Syntax: MASM (Intel operand order).  Target: 32-bit x86, flat model.
; ABI:    cdecl -- arguments on the stack, caller cleans up.

        .686p
        .model flat,c
        option casemap :none

        .code

;-----------------------------------------------------------------------
; void __cdecl dgemm_beta(int M, int N, int, double BETA,
;                         int, int, int, int, double *C, int LDC)
;
; Scales a block of doubles in place:  C = BETA * C.
;
; The block is walked as N runs ("columns") of M consecutive doubles.
; After each run the base pointer advances by LDC * 8 bytes, so LDC is
; the distance, in doubles, between the starts of two consecutive runs.
; NOTE(review): the original listing stated "LDC = max{M,N}"; the code
; itself only uses LDC as that stride -- confirm the caller's convention.
;
; If M <= 0 or N <= 0 nothing is written.
; If BETA compares as zero (or unordered) under FTST, BETA itself is
; stored into every element and the old contents of C are never read.
; Otherwise each element is loaded, multiplied by BETA and stored back.
;
; In:    [esp+04h] M      [esp+08h] N      [esp+10h] BETA (qword)
;        [esp+28h] C      [esp+2Ch] LDC    (offsets at function entry)
; Out:   eax = 0
; Saved: ebp, edi, esi, ebx (pushed/popped here; ebx is not otherwise used)
; Clobb: eax, ecx, edx, flags, x87 status word
; x87:   BETA is pushed on entry and popped before return, so the
;        register stack depth is unchanged for the caller.
;
; Register roles inside the loops:
;   esi = M (elements per run)          ecx = runs still to process
;   edi = start of the next run         ebp = LDC
;   eax = current element pointer       edx = inner loop counter
;   st(0) = BETA
;-----------------------------------------------------------------------
dgemm_beta proc

SAVED_REGS  equ 10h                     ; 4 registers pushed below the return address
ARG_M       equ 04h
ARG_N       equ 08h
ARG_BETA    equ 10h
ARG_C       equ 28h
ARG_LDC     equ 2Ch

        push    ebp
        push    edi
        push    esi
        push    ebx

        mov     esi, dword ptr [esp + SAVED_REGS + ARG_M]
        mov     ecx, dword ptr [esp + SAVED_REGS + ARG_N]
        fld     qword ptr [esp + SAVED_REGS + ARG_BETA]    ; st(0) = BETA
        mov     edi, dword ptr [esp + SAVED_REGS + ARG_C]
        mov     ebp, dword ptr [esp + SAVED_REGS + ARG_LDC]

        test    esi, esi
        jle     finish                  ; M <= 0: nothing to do
        test    ecx, ecx
        jle     finish                  ; N <= 0: nothing to do

        ; Classify BETA.  FTST compares st(0) with 0.0; in the high byte
        ; of the status word C3 = 40h and C2 = 04h.  Both clear means
        ; BETA is ordered and non-zero, so the multiply path is needed.
        ftst
        fnstsw  ax
        and     ah, 44h
        je      short mul_col

;---- BETA is zero (or unordered): overwrite every element with BETA ----
        align   4
beta0_col:
        mov     eax, edi                ; eax = start of this run
        lea     edi, [edi + ebp*8]      ; edi = start of the next run
        mov     edx, esi
        sar     edx, 3                  ; edx = M / 8 (M > 0 here, so edx >= 0)
        ; OF is undefined after a shift by more than 1, so test ZF only
        ; instead of using a signed condition that also reads OF.
        jz      short beta0_rem

        align   4
beta0_x8:                               ; 8 elements per iteration
        fst     qword ptr [eax]
        fst     qword ptr [eax + 08h]
        fst     qword ptr [eax + 10h]
        fst     qword ptr [eax + 18h]
        fst     qword ptr [eax + 20h]
        fst     qword ptr [eax + 28h]
        fst     qword ptr [eax + 30h]
        fst     qword ptr [eax + 38h]
        add     eax, 40h
        dec     edx
        jg      short beta0_x8

        align   4
beta0_rem:
        mov     edx, esi
        and     edx, 7                  ; edx = M mod 8 leftover elements
        jz      short beta0_next

        align   4
beta0_rem1:                             ; one element per iteration
        fst     qword ptr [eax]
        add     eax, 8
        dec     edx
        jg      short beta0_rem1

        align   4
beta0_next:
        dec     ecx
        jg      short beta0_col
        jmp     finish

;---- BETA is non-zero: element = element * BETA ------------------------
mul_col:
        mov     eax, edi                ; eax = start of this run
        lea     edi, [edi + ebp*8]      ; edi = start of the next run
        mov     edx, esi
        sar     edx, 3                  ; edx = M / 8 (M > 0 here, so edx >= 0)
        ; Same reasoning as above: only ZF is well defined after SAR by 3.
        jz      short mul_rem

        align   4
mul_x8:                                 ; 8 elements per iteration
        fld     qword ptr [eax]
        fmul    st, st(1)
        fstp    qword ptr [eax]
        fld     qword ptr [eax + 08h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 08h]
        fld     qword ptr [eax + 10h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 10h]
        fld     qword ptr [eax + 18h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 18h]
        fld     qword ptr [eax + 20h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 20h]
        fld     qword ptr [eax + 28h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 28h]
        fld     qword ptr [eax + 30h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 30h]
        fld     qword ptr [eax + 38h]
        fmul    st, st(1)
        fstp    qword ptr [eax + 38h]
        add     eax, 40h
        dec     edx
        jg      short mul_x8

        align   4
mul_rem:
        mov     edx, esi
        and     edx, 7                  ; edx = M mod 8 leftover elements
        jz      short mul_next

        align   4
mul_rem1:                               ; one element per iteration
        fld     qword ptr [eax]
        fmul    st, st(1)
        fstp    qword ptr [eax]
        add     eax, 8
        dec     edx
        jg      short mul_rem1

        align   4
mul_next:
        dec     ecx
        jg      mul_col
        nop                             ; single-byte filler kept from the original listing

;---- common exit -------------------------------------------------------
finish:
        ffreep  st                      ; pop BETA off the x87 stack
        xor     eax, eax                ; return 0
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret
dgemm_beta endp

        end