64 bit SSE mod
git-svn-id: svn://localhost/trunk/ardour2@308 d708f5d6-7413-0410-9779-e7cbd77b26cf
This commit is contained in:
53
SConstruct
53
SConstruct
@@ -23,20 +23,17 @@ subst_dict = { }
|
||||
|
||||
opts = Options('scache.conf')
|
||||
opts.AddOptions(
|
||||
BoolOption('ALTIVEC', 'Compile using Altivec instructions', 0),
|
||||
('ARCH', 'Set architecture-specific compilation flags by hand (all flags as 1 argument)',''),
|
||||
BoolOption('SYSLIBS', 'USE AT YOUR OWN RISK: CANCELS ALL SUPPORT FROM ARDOUR AUTHORS: Use existing system versions of various libraries instead of internal ones', 0),
|
||||
BoolOption('DEBUG', 'Set to build with debugging information and no optimizations', 0),
|
||||
PathOption('DESTDIR', 'Set the intermediate install "prefix"', '/'),
|
||||
BoolOption('DEVBUILD', 'Use shared libardour (developers only)', 0),
|
||||
BoolOption('NLS', 'Set to turn on i18n support', 1),
|
||||
BoolOption('NOARCH', 'Do not use architecture-specific compilation flags', 0),
|
||||
PathOption('PREFIX', 'Set the install "prefix"', '/usr/local'),
|
||||
BoolOption('VST', 'Compile with support for VST', 0),
|
||||
BoolOption('VERSIONED', 'Add version information to ardour/gtk executable name inside the build directory', 0),
|
||||
BoolOption('USE_SSE_EVERYWHERE', 'Ask the compiler to use x86/SSE instructions and also our hand-written x86/SSE optimizations when possible (off by default)', 0),
|
||||
BoolOption('BUILD_SSE_OPTIMIZATIONS', 'Use our hand-written x86/SSE optimizations when possible (off by default)', 0),
|
||||
BoolOption('BUILD_VECLIB_OPTIMIZATIONS', 'Build with Apple Accelerate/vecLib optimizations when possible (off by default)', 0)
|
||||
EnumOption('DIST_TARGET', 'Build target for cross compiling packagers', 'i386', allowed_values=('none', 'tiger', 'panther', 'i686', 'x86_64', 'i386'), ignorecase=2),
|
||||
BoolOption('FPU_OPTIMIZATION', 'Build runtime checked assembler code', 1)
|
||||
)
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
@@ -574,7 +571,7 @@ config_os = 3;
|
||||
config = config_guess.split ("-")
|
||||
|
||||
if config[config_arch] == 'apple':
|
||||
if env['BUILD_VECLIB_OPTIMIZATIONS'] == 1:
|
||||
if env['FPU_OPTIMIZATION']:
|
||||
opt_flags.append ("-DBUILD_VECLIB_OPTIMIZATIONS")
|
||||
debug_flags.append ("-DBUILD_VECLIB_OPTIMIZATIONS")
|
||||
libraries['core'].Append(LINKFLAGS= '-framework Accelerate')
|
||||
@@ -585,21 +582,20 @@ if config[config_cpu] == 'powerpc':
|
||||
#
|
||||
# -mcpu=7450 does not reliably work with gcc 3.*
|
||||
#
|
||||
if env['NOARCH'] == 0:
|
||||
if env['ALTIVEC'] == 1:
|
||||
if config[config_arch] == 'apple':
|
||||
opt_flags.extend ([ "-mcpu=7450", "-faltivec"])
|
||||
else:
|
||||
opt_flags.extend ([ "-mcpu=7400", "-maltivec", "-mabi=altivec"])
|
||||
else:
|
||||
opt_flags.extend([ "-mcpu=750", "-mmultiple" ])
|
||||
opt_flags.extend (["-mhard-float", "-mpowerpc-gfxopt"])
|
||||
if env['DIST_TARGET'] == 'panther' or env['DIST_TARGET'] == 'tiger':
|
||||
if config[config_arch] == 'apple':
|
||||
opt_flags.extend ([ "-mcpu=7450", "-faltivec"])
|
||||
else:
|
||||
opt_flags.extend ([ "-mcpu=7400", "-maltivec", "-mabi=altivec"])
|
||||
else:
|
||||
opt_flags.extend([ "-mcpu=750", "-mmultiple" ])
|
||||
opt_flags.extend (["-mhard-float", "-mpowerpc-gfxopt"])
|
||||
|
||||
elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_64", config[config_cpu]) != None)):
|
||||
|
||||
build_host_supports_sse = 0
|
||||
|
||||
if env['NOARCH'] == 0:
|
||||
if env['DIST_TARGET'] != 'none':
|
||||
|
||||
debug_flags.append ("-DARCH_X86")
|
||||
opt_flags.append ("-DARCH_X86")
|
||||
@@ -612,7 +608,7 @@ elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_6
|
||||
if "mmx" in x86_flags:
|
||||
opt_flags.append ("-mmmx")
|
||||
if "sse" in x86_flags:
|
||||
build_host_supports_sse = 1
|
||||
build_host_supports_sse = 1
|
||||
if "3dnow" in x86_flags:
|
||||
opt_flags.append ("-m3dnow")
|
||||
|
||||
@@ -621,17 +617,20 @@ elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_6
|
||||
elif config[config_cpu] == "i686":
|
||||
opt_flags.append ("-march=i686")
|
||||
|
||||
if env['USE_SSE_EVERYWHERE'] == 1:
|
||||
opt_flags.extend (["-msse", "-mfpmath=sse"])
|
||||
debug_flags.extend (["-msse", "-mfpmath=sse"])
|
||||
if build_host_supports_sse != 1:
|
||||
print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
|
||||
if (env['DIST_TARGET'] == 'i686') or (env['DIST_TARGET'] == 'x86_64'):
|
||||
opt_flags.extend (["-msse", "-mfpmath=sse"])
|
||||
debug_flags.extend (["-msse", "-mfpmath=sse"])
|
||||
if build_host_supports_sse != 1:
|
||||
print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
|
||||
|
||||
if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
|
||||
opt_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
|
||||
debug_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
|
||||
if build_host_supports_sse != 1:
|
||||
print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
|
||||
if env['FPU_OPTIMIZATION']:
|
||||
opt_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
|
||||
debug_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
|
||||
if env['DIST_TARGET'] == 'x86_64':
|
||||
opt_flags.append ("-DUSE_X86_64_ASM")
|
||||
debug_flags.append ("-DUSE_X86_64_ASM")
|
||||
if build_host_supports_sse != 1:
|
||||
print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
|
||||
|
||||
# end of processor-specific section
|
||||
|
||||
|
||||
@@ -195,12 +195,23 @@ env['BUILDERS']['SharedAsmObject'] = Builder (action = '$CXX -c -fPIC $SOURCE -o
|
||||
single_source = 1)
|
||||
|
||||
if env['DEVBUILD'] == 1:
|
||||
if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
|
||||
arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
|
||||
if env['FPU_OPTIMIZATION']:
|
||||
if env['DIST_TARGET'] == "i386":
|
||||
arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
|
||||
if env['DIST_TARGET'] == "i686":
|
||||
arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
|
||||
if env['DIST_TARGET'] == "x86_64":
|
||||
arch_specific_objects = env.SharedAsmObject('sse_functions_64bit.os', 'sse_functions_64bit.s')
|
||||
|
||||
libardour = ardour.SharedLibrary('ardour', ardour_files + extra_sources + arch_specific_objects)
|
||||
else:
|
||||
if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
|
||||
arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
|
||||
if env['FPU_OPTIMIZATION']:
|
||||
if env['DIST_TARGET'] == "i386":
|
||||
arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
|
||||
if env['DIST_TARGET'] == "i686":
|
||||
arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
|
||||
if env['DIST_TARGET'] == "x86_64":
|
||||
arch_specific_objects = env.StaticObject(target='sse_functions_64bit',source='sse_functions_64bit.s')
|
||||
|
||||
libardour = ardour.StaticLibrary('ardour', ardour_files + extra_sources + arch_specific_objects)
|
||||
|
||||
|
||||
@@ -192,6 +192,7 @@ ARDOUR::init (AudioEngine& engine, bool use_vst, bool try_optimization, void (*s
|
||||
|
||||
unsigned int use_sse = 0;
|
||||
|
||||
#ifndef USE_X86_64_ASM
|
||||
asm volatile (
|
||||
"mov $1, %%eax\n"
|
||||
"pushl %%ebx\n"
|
||||
@@ -202,7 +203,21 @@ ARDOUR::init (AudioEngine& engine, bool use_vst, bool try_optimization, void (*s
|
||||
: "=m" (use_sse)
|
||||
:
|
||||
: "%eax", "%ecx", "%edx", "memory");
|
||||
#else
|
||||
|
||||
asm volatile (
|
||||
"movq $1, %%rax\n"
|
||||
"pushq %%rbx\n"
|
||||
"cpuid\n"
|
||||
"popq %%rbx\n"
|
||||
"andq $33554432, %%rdx\n"
|
||||
"movq %%rdx, %0\n"
|
||||
: "=m" (use_sse)
|
||||
:
|
||||
: "%rax", "%rcx", "%rdx", "memory");
|
||||
|
||||
#endif /* USE_X86_64_ASM */
|
||||
|
||||
if (use_sse) {
|
||||
cerr << "Enabling SSE optimized routines" << endl;
|
||||
|
||||
|
||||
602
libs/ardour/sse_functions_64bit.s
Normal file
602
libs/ardour/sse_functions_64bit.s
Normal file
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
Copyright (C) 2005-2006 Sampo Savolainen, John Rigg
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#;
#; For every frame i in [0, nframes):  dst[i] += src[i] * gain
#;
#; ABI:   System V AMD64, GAS / AT&T syntax
#; In:    %rdi = dst, %rsi = src, %edx = nframes, %xmm0 = gain
#; Out:   nothing
#; Clobb: %rax, %rdx, %xmm0, %xmm1, flags
#;        (%rbx, %rdi and %rsi are saved and restored)
#;
#; The alignment test only looks at address bits 2 and 3, i.e. it assumes
#; both pointers are at least 4-byte aligned.

	.globl	x86_sse_mix_buffers_with_gain
	.type	x86_sse_mix_buffers_with_gain,@function

x86_sse_mix_buffers_with_gain:

	pushq	%rbp
	movq	%rsp, %rbp

	#; save the registers
	pushq	%rbx
	pushq	%rdi
	pushq	%rsi

	#; nframes is a 32-bit argument, so the upper half of %rdx is not
	#; guaranteed to be zero on entry. Writing %edx zero-extends it,
	#; which makes the 64-bit counter arithmetic below safe.
	movl	%edx, %edx

	#; if nframes == 0, go to end
	testq	%rdx, %rdx
	jz	.MBWG_END

	#; Work out the offset of each pointer inside its 16-byte line
	movq	%rdi, %rax
	andq	$12, %rax		#; dst offset: 0, 4, 8 or 12

	movq	%rsi, %rbx
	andq	$12, %rbx		#; src offset: 0, 4, 8 or 12

	cmpq	%rax, %rbx
	jne	.MBWG_NONALIGN		#; different offsets: packed ops impossible

	#; same offset on both sides; zero means already 16-byte aligned
	testq	%rbx, %rbx
	jz	.MBWG_SSE

	#; Pre-loop: process 1-3 frames one at a time until both pointers
	#; reach a 16-byte boundary. %rbx counts the offset up to 16.

.MBWG_PRELOOP:

	movss	(%rsi), %xmm1
	mulss	%xmm0, %xmm1		#; src[i] * gain
	addss	(%rdi), %xmm1		#; + dst[i]
	movss	%xmm1, (%rdi)

	addq	$4, %rdi		#; dst++
	addq	$4, %rsi		#; src++
	decq	%rdx			#; nframes--
	jz	.MBWG_END

	addq	$4, %rbx
	cmpq	$16, %rbx		#; reached 16-byte alignment?
	jne	.MBWG_PRELOOP

.MBWG_SSE:

	#; %rdx is non-zero here; with fewer than 4 frames left the packed
	#; loop cannot run, so fall back to the scalar loop
	cmpq	$4, %rdx
	jb	.MBWG_NONALIGN

	#; broadcast gain to all four lanes of %xmm0
	shufps	$0x00, %xmm0, %xmm0

.MBWG_SSELOOP:

	movaps	(%rsi), %xmm1		#; four source frames
	mulps	%xmm0, %xmm1		#; apply gain
	addps	(%rdi), %xmm1		#; mix with destination
	movaps	%xmm1, (%rdi)		#; store result

	addq	$16, %rdi		#; dst += 4
	addq	$16, %rsi		#; src += 4

	subq	$4, %rdx		#; nframes -= 4
	cmpq	$4, %rdx
	jae	.MBWG_SSELOOP

	testq	%rdx, %rdx
	jz	.MBWG_END

	#; 1-3 frames remain; the scalar loop below finishes them
	#; (lane 0 of %xmm0 still holds gain)

.MBWG_NONALIGN:
.MBWG_NONALIGNLOOP:

	movss	(%rsi), %xmm1
	mulss	%xmm0, %xmm1
	addss	(%rdi), %xmm1
	movss	%xmm1, (%rdi)

	addq	$4, %rdi
	addq	$4, %rsi

	decq	%rdx
	jnz	.MBWG_NONALIGNLOOP

.MBWG_END:

	popq	%rsi
	popq	%rdi
	popq	%rbx

	#; return
	leave
	ret

	.size	x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
|
||||
|
||||
|
||||
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#;
#; For every frame i in [0, nframes):  dst[i] += src[i]
#;
#; ABI:   System V AMD64, GAS / AT&T syntax
#; In:    %rdi = dst, %rsi = src, %edx = nframes
#; Out:   nothing
#; Clobb: %rax, %rdx, %xmm0, flags
#;        (%rbx, %rdi and %rsi are saved and restored)
#;
#; The alignment test only looks at address bits 2 and 3, i.e. it assumes
#; both pointers are at least 4-byte aligned.

	.globl	x86_sse_mix_buffers_no_gain
	.type	x86_sse_mix_buffers_no_gain,@function

x86_sse_mix_buffers_no_gain:

	pushq	%rbp
	movq	%rsp, %rbp

	#; save the registers
	pushq	%rbx
	pushq	%rdi
	pushq	%rsi

	#; nframes is a 32-bit argument, so the upper half of %rdx is not
	#; guaranteed to be zero on entry. Writing %edx zero-extends it,
	#; which makes the 64-bit counter arithmetic below safe.
	movl	%edx, %edx

	#; if nframes == 0, go to end
	testq	%rdx, %rdx
	jz	.MBNG_END

	#; Work out the offset of each pointer inside its 16-byte line
	movq	%rdi, %rax
	andq	$12, %rax		#; dst offset: 0, 4, 8 or 12

	movq	%rsi, %rbx
	andq	$12, %rbx		#; src offset: 0, 4, 8 or 12

	cmpq	%rax, %rbx
	jne	.MBNG_NONALIGN		#; different offsets: packed ops impossible

	#; same offset on both sides; zero means already 16-byte aligned
	testq	%rbx, %rbx
	jz	.MBNG_SSE

	#; Pre-loop: process 1-3 frames one at a time until both pointers
	#; reach a 16-byte boundary. %rbx counts the offset up to 16.

.MBNG_PRELOOP:

	movss	(%rsi), %xmm0
	addss	(%rdi), %xmm0
	movss	%xmm0, (%rdi)

	addq	$4, %rdi		#; dst++
	addq	$4, %rsi		#; src++
	decq	%rdx			#; nframes--
	jz	.MBNG_END

	addq	$4, %rbx
	cmpq	$16, %rbx		#; reached 16-byte alignment?
	jne	.MBNG_PRELOOP

.MBNG_SSE:

	#; frames are left, but with fewer than 4 the packed loop cannot run
	cmpq	$4, %rdx
	jb	.MBNG_NONALIGN

.MBNG_SSELOOP:

	movaps	(%rsi), %xmm0		#; four source frames
	addps	(%rdi), %xmm0		#; mix with destination
	movaps	%xmm0, (%rdi)		#; store result

	addq	$16, %rdi		#; dst += 4
	addq	$16, %rsi		#; src += 4

	subq	$4, %rdx		#; nframes -= 4
	cmpq	$4, %rdx
	jae	.MBNG_SSELOOP

	testq	%rdx, %rdx
	jz	.MBNG_END

	#; 1-3 frames remain; the scalar loop below finishes them

.MBNG_NONALIGN:

	movss	(%rsi), %xmm0		#; src[i]
	addss	(%rdi), %xmm0		#; + dst[i]
	movss	%xmm0, (%rdi)		#; => dst[i]

	addq	$4, %rdi
	addq	$4, %rsi

	decq	%rdx
	jnz	.MBNG_NONALIGN

.MBNG_END:

	popq	%rsi
	popq	%rdi
	popq	%rbx

	#; return
	leave
	ret

	.size	x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
|
||||
|
||||
|
||||
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#;
#; For every frame i in [0, nframes):  buf[i] *= gain
#;
#; ABI:   System V AMD64, GAS / AT&T syntax
#; In:    %rdi = buf, %esi = nframes, %xmm0 = gain
#; Out:   nothing
#; Clobb: %rax, %rcx, %rdx, %xmm0, %xmm1, flags
#;        (%rdi is saved and restored)
#;
#; The alignment test only looks at address bits 2 and 3, i.e. it assumes
#; buf is at least 4-byte aligned.

	.globl	x86_sse_apply_gain_to_buffer
	.type	x86_sse_apply_gain_to_buffer,@function

x86_sse_apply_gain_to_buffer:

	pushq	%rbp
	movq	%rsp, %rbp

	#; save %rdi
	pushq	%rdi

	#; nframes is a 32-bit argument, so the upper half of %rsi is not
	#; guaranteed to be zero on entry. A 32-bit move zero-extends it
	#; into the 64-bit frame counter %rcx.
	movl	%esi, %ecx

	#; if nframes == 0, go to end
	testq	%rcx, %rcx
	jz	.AG_END

	#; broadcast gain to all four lanes of %xmm0
	shufps	$0x00, %xmm0, %xmm0

	#; offset of buf inside its 16-byte line: 0, 4, 8 or 12
	movq	%rdi, %rdx
	andq	$12, %rdx
	jz	.AG_SSE			#; already 16-byte aligned

	#; Pre-loop: scale 1-3 frames one at a time until buf reaches a
	#; 16-byte boundary. %rdx counts the offset up to 16.

.AGLP_START:

	movss	(%rdi), %xmm1
	mulss	%xmm0, %xmm1
	movss	%xmm1, (%rdi)

	addq	$4, %rdi		#; buf++
	decq	%rcx			#; nframes--
	jz	.AG_END			#; ran out of frames

	addq	$4, %rdx
	cmpq	$16, %rdx
	jne	.AGLP_START

.AG_SSE:

	#; buf is 16-byte aligned and %rcx (> 0) frames remain.
	#; %rax = number of packed iterations = remaining / 4
	movq	%rcx, %rax
	shrq	$2, %rax
	jz	.AGPOST_START		#; fewer than 4 frames: scalar tail only

.AGLP_SSE:

	movaps	(%rdi), %xmm1
	mulps	%xmm0, %xmm1
	movaps	%xmm1, (%rdi)

	addq	$16, %rdi		#; buf += 4

	decq	%rax
	jnz	.AGLP_SSE

	#; frames left over after the packed loop = remaining % 4
	andq	$3, %rcx
	jz	.AG_END

.AGPOST_START:

	#; scalar tail; %rcx holds 1-3 here
	movss	(%rdi), %xmm1
	mulss	%xmm0, %xmm1
	movss	%xmm1, (%rdi)

	addq	$4, %rdi		#; buf++
	decq	%rcx			#; nframes--
	jnz	.AGPOST_START

.AG_END:

	popq	%rdi

	#; return
	leave
	ret

	.size	x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
|
||||
#; end proc
|
||||
|
||||
|
||||
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
#;
#; For every frame i in [0, nframes):  buf[i] *= gain_vector[i]
#;
#; ABI:   System V AMD64, GAS / AT&T syntax
#; In:    %rdi = buf, %rsi = gain_vector, %edx = nframes
#; Out:   nothing
#; Clobb: %rax, %rdx, %xmm0, %xmm1, flags
#;        (%rbx, %rdi and %rsi are saved and restored)
#;
#; The alignment test only looks at address bits 2 and 3, i.e. it assumes
#; both pointers are at least 4-byte aligned.

	.globl	x86_sse_apply_gain_vector
	.type	x86_sse_apply_gain_vector,@function

x86_sse_apply_gain_vector:

	pushq	%rbp
	movq	%rsp, %rbp

	#; save registers
	pushq	%rdi
	pushq	%rsi
	pushq	%rbx

	#; nframes is a 32-bit argument, so the upper half of %rdx is not
	#; guaranteed to be zero on entry. Writing %edx zero-extends it,
	#; which makes the 64-bit counter arithmetic below safe.
	movl	%edx, %edx

	#; if nframes == 0 go to end
	testq	%rdx, %rdx
	jz	.AGA_END

	#; Work out the offset of each pointer inside its 16-byte line
	movq	%rdi, %rax
	andq	$12, %rax		#; buf offset: 0, 4, 8 or 12

	movq	%rsi, %rbx
	andq	$12, %rbx		#; gain_vector offset: 0, 4, 8 or 12

	cmpq	%rax, %rbx
	jne	.AGA_ENDLOOP		#; different offsets: scalar loop only

	testq	%rax, %rax
	jz	.AGA_SSE		#; both already 16-byte aligned

	#; Both pointers are off a 16-byte boundary by the same amount:
	#; process single frames until they reach it. %rax counts up to 16.

.AGA_ALIGNLOOP:

	movss	(%rdi), %xmm0		#; buf[i]
	movss	(%rsi), %xmm1		#; gain_vector[i]
	mulss	%xmm1, %xmm0
	movss	%xmm0, (%rdi)		#; scaled sample => buf[i]

	decq	%rdx
	jz	.AGA_END

	addq	$4, %rdi		#; buf++
	addq	$4, %rsi		#; gain_vector++

	addq	$4, %rax
	cmpq	$16, %rax
	jne	.AGA_ALIGNLOOP

	#; Frames are left for sure (checked on entry and inside the loop
	#; above), but there may be fewer than 4 of them.

.AGA_SSE:

	movq	%rdx, %rax		#; remaining frames
	shrq	$2, %rax		#; packed iterations = remaining / 4
	jz	.AGA_ENDLOOP		#; fewer than 4: scalar loop only

.AGA_SSELOOP:

	movaps	(%rdi), %xmm0
	movaps	(%rsi), %xmm1
	mulps	%xmm1, %xmm0
	movaps	%xmm0, (%rdi)

	addq	$16, %rdi
	addq	$16, %rsi

	decq	%rax
	jnz	.AGA_SSELOOP

	andq	$3, %rdx		#; frames left over = remaining & 3
	jz	.AGA_END

	#; At this point frames are left to process, but either fewer than
	#; 4 remain or the buffers are not mutually aligned, so the packed
	#; instructions cannot be used.

.AGA_ENDLOOP:

	movss	(%rdi), %xmm0		#; buf[i]
	movss	(%rsi), %xmm1		#; gain_vector[i]
	mulss	%xmm1, %xmm0
	movss	%xmm0, (%rdi)		#; scaled sample => buf[i]

	addq	$4, %rdi
	addq	$4, %rsi
	decq	%rdx			#; nframes--
	jnz	.AGA_ENDLOOP

.AGA_END:

	popq	%rbx
	popq	%rsi
	popq	%rdi

	leave
	ret

	.size	x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
|
||||
#; end proc
|
||||
|
||||
|
||||
#; float x86_sse_compute_peak(float *buf, long nframes, float current);
#;
#; Returns the largest of `current` and |buf[i]| for i in [0, nframes).
#;
#; ABI:   System V AMD64, GAS / AT&T syntax
#; In:    %rdi = buf, %rsi = nframes, %xmm0 = current
#; Out:   %xmm0 (lane 0) = peak value
#; Clobb: %rax, %rcx, %rdx, %xmm1, %xmm2, flags
#;        (%rdi is saved and restored)
#;
#; NOTE(review): the prototype above declares nframes as `long`, so all
#; 64 bits of %rsi are used as the frame count. If the C declaration
#; really is a 32-bit unsigned type, %rsi must be zero-extended first
#; (movl %esi, %ecx) - confirm against the header.
#;
#; The alignment test only looks at address bits 2 and 3, i.e. it assumes
#; buf is at least 4-byte aligned.

	.globl	x86_sse_compute_peak
	.type	x86_sse_compute_peak,@function

#; 0x7fffffff: every bit except the IEEE-754 sign bit. AND-ing a float
#; with it yields the absolute value.
abs_mask:
	.long	2147483647

x86_sse_compute_peak:

	pushq	%rbp
	movq	%rsp, %rbp

	#; save %rdi
	pushq	%rdi

	#; if nframes == 0, go to end (current is returned unchanged)
	movq	%rsi, %rcx
	testq	%rcx, %rcx
	jz	.CP_END

	#; build the "abs" mask in all four lanes of %xmm2.
	#; The load is RIP-relative so that this file can be linked into a
	#; position-independent shared object; an absolute reference to
	#; abs_mask cannot be.
	movss	abs_mask(%rip), %xmm2
	shufps	$0x00, %xmm2, %xmm2

	#; offset of buf inside its 16-byte line: 0, 4, 8 or 12
	movq	%rdi, %rdx
	andq	$12, %rdx
	jz	.CP_SSE			#; already 16-byte aligned

	#; Pre-loop: fold 1-3 frames one at a time until buf reaches a
	#; 16-byte boundary. %rdx counts the offset up to 16.

.LP_START:

	movss	(%rdi), %xmm1
	andps	%xmm2, %xmm1		#; |buf[i]|
	maxss	%xmm1, %xmm0

	addq	$4, %rdi		#; buf++
	decq	%rcx			#; nframes--
	jz	.CP_END			#; ran out of frames

	addq	$4, %rdx
	cmpq	$16, %rdx
	jne	.LP_START

.CP_SSE:

	#; buf is 16-byte aligned and %rcx (> 0) frames remain.
	#; %rax = number of packed iterations = remaining / 4
	movq	%rcx, %rax
	shrq	$2, %rax
	jz	.POST_START		#; fewer than 4 frames: scalar tail only

	#; broadcast the running maximum to all four lanes
	shufps	$0x00, %xmm0, %xmm0

.LP_SSE:

	movaps	(%rdi), %xmm1
	andps	%xmm2, %xmm1		#; four absolute values
	maxps	%xmm1, %xmm0		#; per-lane running maximum

	addq	$16, %rdi

	decq	%rax
	jnz	.LP_SSE

	#; Reduce the four per-lane maxima to a single value
	movaps	%xmm0, %xmm1
	shufps	$0x4e, %xmm1, %xmm1	#; swap the two pairs (1234 => 3412)
	maxps	%xmm1, %xmm0		#; maxima of the two pairs
	movaps	%xmm0, %xmm1
	shufps	$0xb1, %xmm1, %xmm1	#; swap inside each pair (1234 => 2143)
	maxps	%xmm1, %xmm0

	#; every lane of %xmm0 now holds the current maximum

	#; frames left over after the packed loop = remaining % 4
	andq	$3, %rcx
	jz	.CP_END

.POST_START:

	#; scalar tail; %rcx holds 1-3 here
	movss	(%rdi), %xmm1
	andps	%xmm2, %xmm1
	maxss	%xmm1, %xmm0

	addq	$4, %rdi		#; buf++
	decq	%rcx			#; nframes--
	jnz	.POST_START

.CP_END:

	popq	%rdi

	#; return (result is in %xmm0)
	leave
	ret

	.size	x86_sse_compute_peak, .-x86_sse_compute_peak
|
||||
#; end proc
|
||||
Reference in New Issue
Block a user