Something like this:
# Zero the BSS: fill the region from __bss_start up to _end with zeros,
# one 4-byte dword at a time.
# NOTE(review): this snippet only loads %di and %cx, so it presumably runs as
# 16-bit code; it also relies on %es covering the BSS and on the direction
# flag being clear (cld), neither of which is set up here -- confirm at the
# call site.
movw $__bss_start, %di   # %di = start of BSS (destination for stosl)
movw $_end+3, %cx        # %cx = end of BSS + 3, so the shift below rounds up
xorl %eax, %eax          # %eax = 0, the value stosl will store
subw %di, %cx            # %cx = %cx - %di = size of BSS in bytes (+3)
shrw $2, %cx             # %cx = size / 4 = number of dwords to clear
rep stosl                # Repeat %cx times: store %eax (4 bytes of 0) at
                         # the address in %di, then advance %di by 4
On the rep stosl:
rep
is a repeat prefix that will repeat the following instruction (out of a limited set) %cx times.
stosl
stores the value of %eax at the address pointed to by %(e)di, and then increases %(e)di by the size of %eax (4 bytes), assuming the direction flag is clear.
As an example, rep stosl
with %eax set to 0, %edi set to 0x4000 and %cx set to 4, will set the 16 bytes of memory from 0x4000 up to (but not including) 0x4010 to zero.