汇编优化问题

2019-04-01  本文已影响0人  Zparkle

1.循环计算字符串长度中的代码耗时问题

在用gdb阅读bomblab 问题1时,阅读了一下String_length函数,有个疑问

; String_length: scans the bytes starting at (%rdi) and leaves the string length in %eax.
; NOTE(review): this listing is transcribed from gdb; jump targets are abbreviated or omitted.
String_length:
    cmpb  $0x0,(%rdi)  ;is the first byte NUL, i.e. is the string empty?
    je d  ;empty string: presumably jumps to the `mov $0x0,%eax` near the end (target abbreviated as `d`)
    mov  %rdi,%rdx ;copy the first argument (start of the string) into rdx, used as the scan pointer
    add  $0x1,%rdx ;advance the scan pointer to the next byte
    mov   %edx,%eax;copy the low 32 bits of the scan pointer into the return register
    sub  %edi,%eax;subtract the low 32 bits of the start address (edi is the lower half of rdi): eax = bytes scanned so far
    cmpb  $0x0,(%rdx);has the scan pointer reached the terminating NUL?
    jne ; if not, loop again (per the author's note the target is the `add` above; it is omitted in this listing)
    repz retq; NUL reached: return, eax holds the final length of the string
    mov  $0x0,%eax;empty-string path: set the return value to 0
    retq
    

My question is about the middle of this function: look at the 5th and 6th instructions of String_length. To calculate the length of the input string, the function uses one mov and one sub to compute the distance between the current position and the head of the string on every iteration. But why don't we just set the return value %eax to $0x0 once and add one at each step? I assumed that to compute a sub the computer has to do more work than for a plain add.(取反加一再加?)
So why can't we modify the code as follows?

; String_length (proposed variant): counts the bytes in %eax with one add per iteration
; instead of recomputing "current - start" with mov + sub.
; NOTE(review): pseudo-listing; `je d` and `jne (goto here)` are not literal assembler syntax.
String_length:
    cmpb  $0x0,(%rdi) ;is the first byte NUL, i.e. is the string empty?
    je d ;empty string: presumably jumps to the `mov $0x0,%eax` near the end (target abbreviated as `d`)
    mov  %rdi,%rdx ;copy the start of the string into rdx, used as the scan pointer
    mov $0x0,%eax;return value (the running length) starts at 0
here:
    add  $0x1,%rdx ;advance the scan pointer to the next byte
    add  $0x1,%eax ;count one more byte
    cmpb  $0x0,(%rdx) ;has the scan pointer reached the terminating NUL?
    jne (goto here) ;if not, repeat the loop body at `here`
    repz retq ;NUL reached: return, eax holds the length
    mov  $0x0,%eax ;empty-string path: set the return value to 0
    retq
    

Both versions have 11 instructions, and I will test whether the second one runs faster (I hope so).

2019/4/2: I ran the test

The source code of my test program is below:

#include<stdio.h>
#include<time.h>
/*
 * String_length_2 - length of a NUL-terminated string, "mov + sub" variant.
 *
 * Mirrors the instruction sequence of the original routine: on every
 * iteration the length is recomputed as (current position - start).
 * x86-64 only (AT&T syntax, GCC-style extended asm).
 *
 * str: pointer to a NUL-terminated string; must not be NULL.
 * Returns the number of bytes before the terminating NUL (0 for "").
 */
int String_length_2(char* str){
        int length;
        char *cursor = str;     /* scan pointer; the asm advances it */
        /*
         * volatile: keep the asm inside the benchmark loop even though its
         * result is overwritten on every iteration.
         * Numeric local labels (1:, 2:, 3:) stay unique if the compiler
         * inlines or duplicates this function; named labels would collide.
         */
        __asm__ __volatile__(
                "cmpb $0x0,(%[cur])\n\t"        /* empty string? */
                "je 2f\n\t"
                "1:\n\t"
                "add $0x1,%[cur]\n\t"           /* next byte */
                "mov %k[cur],%[len]\n\t"        /* low 32 bits of the cursor */
                "sub %k[base],%[len]\n\t"       /* minus low 32 bits of the start */
                "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator? */
                "jne 1b\n\t"
                "jmp 3f\n\t"
                "2:\n\t"
                "mov $0x0,%[len]\n\t"           /* empty string: length 0 */
                "3:"
                /* '&': both outputs are written while [base] is still needed,
                 * so they must not share a register with it. */
                : [len] "=&r"(length), [cur] "+&r"(cursor)
                : [base] "r"(str)
                /* flags are modified; "memory" because the asm reads the
                 * string through a pointer the compiler cannot see into. */
                : "cc", "memory");
        return length;
}
/*
 * String_length - length of a NUL-terminated string, "counter" variant.
 *
 * Keeps a running count that is incremented by an immediate 1 for every
 * byte scanned. x86-64 only (AT&T syntax, GCC-style extended asm).
 *
 * str: pointer to a NUL-terminated string; must not be NULL.
 * Returns the number of bytes before the terminating NUL (0 for "").
 */
int String_length(char* str){
        int length;
        /*
         * volatile: keep the asm inside the benchmark loop even though its
         * result is overwritten on every iteration.
         * Numeric local labels (1:, 2:, 3:) stay unique if the compiler
         * inlines or duplicates this function; named labels would collide.
         */
        __asm__ __volatile__(
                "cmpb $0x0,(%[cur])\n\t"        /* empty string? */
                "je 2f\n\t"
                "mov $0x0,%[len]\n\t"           /* counter starts at 0 */
                "1:\n\t"
                "add $0x1,%[cur]\n\t"           /* next byte */
                "add $0x1,%[len]\n\t"           /* count it */
                "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator? */
                "jne 1b\n\t"
                "jmp 3f\n\t"
                "2:\n\t"
                "mov $0x0,%[len]\n\t"           /* empty string: length 0 */
                "3:"
                /* [cur] is read AND written (the local copy of str is
                 * advanced), so it is a "+r" operand, not a plain input.
                 * [len] is earlyclobber: it is written before the last read
                 * of [cur] and must not share its register. */
                : [len] "=&r"(length), [cur] "+r"(str)
                :
                /* flags are modified; "memory" because the asm reads the
                 * string through a pointer the compiler cannot see into. */
                : "cc", "memory");
        return length;
}
/*
 * Benchmark driver: calls each string-length routine 200000000 times on the
 * same short string and prints the elapsed clock() ticks plus the last
 * length returned, so the two timings can be compared by eye.
 *
 * The printed time is in raw clock() ticks (divide by CLOCKS_PER_SEC for
 * seconds).
 */
int main(void){
        char str[] = "nice!";
        int times,length = 0;
        clock_t start,end;

        /* Counter-based variant. */
        start = clock();
        for(times = 200000000; times>0; times--){
                length = String_length(str);
        }
        end = clock();
        /* clock_t is not guaranteed to be long; cast so that %ld is correct. */
        printf("my_func time consume=%ld\nand length=%d\n",(long)(end-start),length);

        /* Original mov+sub variant. */
        start = clock();
        for(times = 200000000;times>0;times--){
                length = String_length_2(str);
        }
        end = clock();
        printf("origin_func time consume=%ld\nand length=%d\n",(long)(end-start),length);
        return 0;
}

Unfortunately, the result is:

zhuangh7@LAPTOP-BK6LH6G7:/mnt/c/bomb$ ./hello
my_func time consume=875000
and length=5
origin_func time consume=812500
and length=5
zhuangh7@LAPTOP-BK6LH6G7:/mnt/c/bomb$ ./hello
my_func time consume=859375
and length=5
origin_func time consume=828125
and length=5

I have no idea why the original function performs better. I hope someone can give me a hand.

30 minutes later


马腿还是牛逼啊……
立即数参与的运算更慢一些,所以我把代码改成了:

/*
 * String_length - length of a NUL-terminated string, "register step" variant.
 *
 * Same as the counter variant, except that the increment of the counter
 * comes from a register that is loaded with 1 once before the loop, instead
 * of from an immediate operand. x86-64 only (AT&T syntax, GCC-style
 * extended asm).
 *
 * str: pointer to a NUL-terminated string; must not be NULL.
 * Returns the number of bytes before the terminating NUL (0 for "").
 */
int String_length(char* str){
        int length;
        int step;       /* scratch register holding the constant 1 */
        /*
         * volatile: keep the asm inside the benchmark loop even though its
         * result is overwritten on every iteration.
         * Numeric local labels (1:, 2:, 3:) stay unique if the compiler
         * inlines or duplicates this function; named labels would collide.
         */
        __asm__ __volatile__(
                "cmpb $0x0,(%[cur])\n\t"        /* empty string? */
                "je 2f\n\t"
                "mov $0x0,%[len]\n\t"           /* counter starts at 0 */
                "mov $0x1,%[one]\n\t"           /* load the step once */
                "1:\n\t"
                "add $0x1,%[cur]\n\t"           /* next byte */
                "add %[one],%[len]\n\t"         /* count it, register operand */
                "cmpb $0x0,(%[cur])\n\t"        /* reached the terminator? */
                "jne 1b\n\t"
                "jmp 3f\n\t"
                "2:\n\t"
                "mov $0x0,%[len]\n\t"           /* empty string: length 0 */
                "3:"
                /* The scratch register is a compiler-chosen output rather
                 * than a hard-coded one, so it can never alias [len] or
                 * [cur]. [cur] is read and written, hence "+r"; [len] and
                 * [one] are earlyclobber because they are written before
                 * the last read of [cur]. */
                : [len] "=&r"(length), [one] "=&r"(step), [cur] "+r"(str)
                :
                /* flags are modified; "memory" because the asm reads the
                 * string through a pointer the compiler cannot see into. */
                : "cc", "memory");
        return length;
}

把立即数1赋值到某个寄存器rdx上,然后在循环中调用寄存器相加。
结果:

my_func time consume=875000
and length=5
origin_func time consume=968750
and length=5

就很棒。下一个问题:他到底是怎么搞出这种奇葩代码来计算字符串长度的,C语言源代码是什么,编译器又是如何得出这种代码的,没有任何思路,不再讨论。

上一篇下一篇

猜你喜欢

热点阅读