diff --git a/CMakeLists.txt b/CMakeLists.txt index e76de31..d3d6ee8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,7 +90,8 @@ endif() ######################################################################## find_package(PythonLibs 3) -find_package(Gnuradio "3.8" REQUIRED runtime fft blocks filter analog digital) +find_package(Gnuradio "3.8" REQUIRED + COMPONENTS runtime blocks fft analog filter digital pmt) include(GrVersion) include(GrPlatform) #define LIB_SUFFIX @@ -148,50 +149,6 @@ find_package(JsonCpp REQUIRED) option(INCLUDE_DEBUG_BLOCKS "Enable/Disable blocks that are used for debugging purposes" ON) - -######################################################################## -# Find gr-satnogs external build dependencies -######################################################################## - -######################################################################## -# Search for the libfec if it is already installed in the system -# If not, install the internal one. -######################################################################## -find_package(Fec) -if(NOT FEC_FOUND) - message(WARNING "libfec is not installed. 
The internal libfec will be automatically build and install.") - include(ExternalProject) - ExternalProject_Add(FEC_EXTERNAL - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libfec - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libfec - CMAKE_ARGS "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" - "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" - INSTALL_COMMAND "" - ) - - ExternalProject_Get_Property(FEC_EXTERNAL binary_dir) - add_library(fec SHARED IMPORTED) - - set_property(TARGET fec PROPERTY IMPORTED_LOCATION ${install_dir}/libfec.so) - - add_dependencies(fec FEC_EXTERNAL) - set(FEC_LIBRARIES "${binary_dir}/libfec.so") - set(FEC_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libfec") - - # Install the header and the library in the standard places - install(FILES - "${FEC_INCLUDE_DIRS}/fec.h" - DESTINATION "include" - ) - install(FILES - ${FEC_LIBRARIES} - DESTINATION lib${LIB_SUFFIX} - ) -else() - add_library(fec INTERFACE) -endif() - ######################################################################## # Setup doxygen option ######################################################################## diff --git a/cmake/Modules/FindFec.cmake b/cmake/Modules/FindFec.cmake deleted file mode 100644 index 76b4e42..0000000 --- a/cmake/Modules/FindFec.cmake +++ /dev/null @@ -1,25 +0,0 @@ -INCLUDE(FindPkgConfig) -PKG_CHECK_MODULES(PC_FEC fec) - -FIND_PATH( - FEC_INCLUDE_DIRS - NAMES fec.h - HINTS $ENV{FEC_DIR}/include - ${PC_FEC_INCLUDEDIR} - PATHS /usr/local/include - /usr/include -) - -FIND_LIBRARY( - FEC_LIBRARIES - NAMES fec - HINTS $ENV{FEC_DIR}/lib - ${PC_FEC_LIBDIR} - PATHS /usr/local/lib - /usr/local/lib64 - /usr/lib - /usr/lib64 -) - -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS) diff --git a/debian/gr-satnogs.install b/debian/gr-satnogs.install index 39ab9db..adf84e4 100644 --- a/debian/gr-satnogs.install +++ b/debian/gr-satnogs.install @@ -1,6 +1,7 @@ usr/bin/* 
usr/include/* usr/lib/*/libgnuradio-satnogs.so +usr/lib/*/libgnuradio-satnogs-fec.so usr/lib/*/cmake/* usr/lib/python* usr/share/* diff --git a/debian/libgnuradio-satnogs.install b/debian/libgnuradio-satnogs.install index d95d73e..3ddde58 100644 --- a/debian/libgnuradio-satnogs.install +++ b/debian/libgnuradio-satnogs.install @@ -1,2 +1 @@ -usr/lib/*/libfec.so usr/lib/*/lib*.so.* diff --git a/include/satnogs/CMakeLists.txt b/include/satnogs/CMakeLists.txt index b56563e..465bc09 100644 --- a/include/satnogs/CMakeLists.txt +++ b/include/satnogs/CMakeLists.txt @@ -21,6 +21,8 @@ ######################################################################## # Install public header files ######################################################################## +add_subdirectory(libfec) + list(APPEND DEBUG_HEADER_FILES cw_encoder.h debug_msg_source_raw.h diff --git a/include/satnogs/libfec/CMakeLists.txt b/include/satnogs/libfec/CMakeLists.txt new file mode 100644 index 0000000..cba02b0 --- /dev/null +++ b/include/satnogs/libfec/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright 2011,2012 Free Software Foundation, Inc. +# +# This file was generated by gr_modtool, a tool from the GNU Radio framework +# This file is a part of gr-satnogs +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. 
+ +######################################################################## +# Install public header files +######################################################################## +install(FILES + fec.h DESTINATION include/satnogs/libfec +) diff --git a/include/satnogs/libfec/fec.h b/include/satnogs/libfec/fec.h new file mode 100644 index 0000000..66afaf3 --- /dev/null +++ b/include/satnogs/libfec/fec.h @@ -0,0 +1,420 @@ +/* User include file for libfec + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifndef _FEC_H_ +#define _FEC_H_ + +#include <satnogs/api.h> +#ifdef __cplusplus +extern "C" { +#endif + + +/* r=1/2 k=7 convolutional encoder polynomials + * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB + * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted + */ +#define V27POLYA 0x6d +#define V27POLYB 0x4f + +SATNOGS_API void *create_viterbi27(int len); +SATNOGS_API void set_viterbi27_polynomial(int polys[2]); +SATNOGS_API int init_viterbi27(void *vp, int starting_state); +SATNOGS_API int update_viterbi27_blk(void *vp, unsigned char sym[], int npairs); +SATNOGS_API int chainback_viterbi27(void *vp, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27(void *vp); + +#ifdef __VEC__ +SATNOGS_API void *create_viterbi27_av(int len); +SATNOGS_API void set_viterbi27_polynomial_av(int polys[2]); +SATNOGS_API int init_viterbi27_av(void *p, int starting_state); +SATNOGS_API int chainback_viterbi27_av(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27_av(void *p); +SATNOGS_API int update_viterbi27_blk_av(void *p, unsigned char *syms, + int nbits); +#endif + +#ifdef __i386__ +SATNOGS_API void *create_viterbi27_mmx(int len); +SATNOGS_API void set_viterbi27_polynomial_mmx(int polys[2]); +SATNOGS_API int init_viterbi27_mmx(void *p, int starting_state); +SATNOGS_API int 
chainback_viterbi27_mmx(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27_mmx(void *p); +SATNOGS_API int update_viterbi27_blk_mmx(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi27_sse(int len); +SATNOGS_API void set_viterbi27_polynomial_sse(int polys[2]); +SATNOGS_API int init_viterbi27_sse(void *p, int starting_state); +SATNOGS_API int chainback_viterbi27_sse(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27_sse(void *p); +SATNOGS_API int update_viterbi27_blk_sse(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi27_sse2(int len); +SATNOGS_API void set_viterbi27_polynomial_sse2(int polys[2]); +SATNOGS_API int init_viterbi27_sse2(void *p, int starting_state); +SATNOGS_API int chainback_viterbi27_sse2(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27_sse2(void *p); +SATNOGS_API int update_viterbi27_blk_sse2(void *p, unsigned char *syms, + int nbits); +#endif + +SATNOGS_API void *create_viterbi27_port(int len); +SATNOGS_API void set_viterbi27_polynomial_port(int polys[2]); +SATNOGS_API int init_viterbi27_port(void *p, int starting_state); +SATNOGS_API int chainback_viterbi27_port(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi27_port(void *p); +SATNOGS_API int update_viterbi27_blk_port(void *p, unsigned char *syms, + int nbits); + +/* r=1/2 k=9 convolutional encoder polynomials */ +#define V29POLYA 0x1af +#define V29POLYB 0x11d + +SATNOGS_API void *create_viterbi29(int len); +SATNOGS_API void set_viterbi29_polynomial(int polys[2]); +SATNOGS_API int init_viterbi29(void *vp, int starting_state); +SATNOGS_API int update_viterbi29_blk(void *vp, unsigned char syms[], int nbits); +SATNOGS_API int chainback_viterbi29(void *vp, unsigned char *data, + unsigned int nbits, 
unsigned int endstate); +SATNOGS_API void delete_viterbi29(void *vp); + +#ifdef __VEC__ +SATNOGS_API void *create_viterbi29_av(int len); +SATNOGS_API void set_viterbi29_polynomial_av(int polys[2]); +SATNOGS_API int init_viterbi29_av(void *p, int starting_state); +SATNOGS_API int chainback_viterbi29_av(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi29_av(void *p); +SATNOGS_API int update_viterbi29_blk_av(void *p, unsigned char *syms, + int nbits); +#endif + +#ifdef __i386__ +SATNOGS_API void *create_viterbi29_mmx(int len); +SATNOGS_API void set_viterbi29_polynomial_mmx(int polys[2]); +SATNOGS_API int init_viterbi29_mmx(void *p, int starting_state); +SATNOGS_API int chainback_viterbi29_mmx(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi29_mmx(void *p); +SATNOGS_API int update_viterbi29_blk_mmx(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi29_sse(int len); +SATNOGS_API void set_viterbi29_polynomial_sse(int polys[2]); +SATNOGS_API int init_viterbi29_sse(void *p, int starting_state); +SATNOGS_API int chainback_viterbi29_sse(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi29_sse(void *p); +SATNOGS_API int update_viterbi29_blk_sse(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi29_sse2(int len); +SATNOGS_API void set_viterbi29_polynomial_sse2(int polys[2]); +SATNOGS_API int init_viterbi29_sse2(void *p, int starting_state); +SATNOGS_API int chainback_viterbi29_sse2(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi29_sse2(void *p); +SATNOGS_API int update_viterbi29_blk_sse2(void *p, unsigned char *syms, + int nbits); +#endif + +SATNOGS_API void *create_viterbi29_port(int len); +SATNOGS_API void set_viterbi29_polynomial_port(int polys[2]); +SATNOGS_API int init_viterbi29_port(void *p, int starting_state); +SATNOGS_API int 
chainback_viterbi29_port(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi29_port(void *p); +SATNOGS_API int update_viterbi29_blk_port(void *p, unsigned char *syms, + int nbits); + +/* r=1/3 k=9 convolutional encoder polynomials */ +#define V39POLYA 0x1ed +#define V39POLYB 0x19b +#define V39POLYC 0x127 + +SATNOGS_API void *create_viterbi39(int len); +SATNOGS_API void set_viterbi39_polynomial(int polys[3]); +SATNOGS_API int init_viterbi39(void *vp, int starting_state); +SATNOGS_API int update_viterbi39_blk(void *vp, unsigned char syms[], int nbits); +SATNOGS_API int chainback_viterbi39(void *vp, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi39(void *vp); + +#ifdef __VEC__ +SATNOGS_API void *create_viterbi39_av(int len); +SATNOGS_API void set_viterbi39_polynomial_av(int polys[3]); +SATNOGS_API int init_viterbi39_av(void *p, int starting_state); +SATNOGS_API int chainback_viterbi39_av(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi39_av(void *p); +SATNOGS_API int update_viterbi39_blk_av(void *p, unsigned char *syms, + int nbits); +#endif + +#ifdef __i386__ +SATNOGS_API void *create_viterbi39_mmx(int len); +SATNOGS_API void set_viterbi39_polynomial_mmx(int polys[3]); +SATNOGS_API int init_viterbi39_mmx(void *p, int starting_state); +SATNOGS_API int chainback_viterbi39_mmx(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi39_mmx(void *p); +SATNOGS_API int update_viterbi39_blk_mmx(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi39_sse(int len); +SATNOGS_API void set_viterbi39_polynomial_sse(int polys[3]); +SATNOGS_API int init_viterbi39_sse(void *p, int starting_state); +SATNOGS_API int chainback_viterbi39_sse(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void 
delete_viterbi39_sse(void *p); +SATNOGS_API int update_viterbi39_blk_sse(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi39_sse2(int len); +SATNOGS_API void set_viterbi39_polynomial_sse2(int polys[3]); +SATNOGS_API int init_viterbi39_sse2(void *p, int starting_state); +SATNOGS_API int chainback_viterbi39_sse2(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi39_sse2(void *p); +SATNOGS_API int update_viterbi39_blk_sse2(void *p, unsigned char *syms, + int nbits); +#endif + +SATNOGS_API void *create_viterbi39_port(int len); +SATNOGS_API void set_viterbi39_polynomial_port(int polys[3]); +SATNOGS_API int init_viterbi39_port(void *p, int starting_state); +SATNOGS_API int chainback_viterbi39_port(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi39_port(void *p); +SATNOGS_API int update_viterbi39_blk_port(void *p, unsigned char *syms, + int nbits); + + +/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion + * dfree = 56 + * These bits may be left-right flipped from some textbook representations; + * here I have the bits entering the shift register from the right (low) end + * + * Some other spacecraft use the same code, but with the polynomials in a different order. + * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion, + * so use set_viterbi615_polynomial() as appropriate. 
+ */ +#define V615POLYA 042631 +#define V615POLYB 047245 +#define V615POLYC 056507 +#define V615POLYD 073363 +#define V615POLYE 077267 +#define V615POLYF 064537 + +SATNOGS_API void *create_viterbi615(int len); +SATNOGS_API void set_viterbi615_polynomial(int polys[6]); +SATNOGS_API int init_viterbi615(void *vp, int starting_state); +SATNOGS_API int update_viterbi615_blk(void *vp, unsigned char *syms, int nbits); +SATNOGS_API int chainback_viterbi615(void *vp, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615(void *vp); + +#ifdef __VEC__ +SATNOGS_API void *create_viterbi615_av(int len); +SATNOGS_API void set_viterbi615_polynomial_av(int polys[6]); +SATNOGS_API int init_viterbi615_av(void *p, int starting_state); +SATNOGS_API int chainback_viterbi615_av(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615_av(void *p); +SATNOGS_API int update_viterbi615_blk_av(void *p, unsigned char *syms, + int nbits); +#endif + +#ifdef __i386__ +SATNOGS_API void *create_viterbi615_mmx(int len); +SATNOGS_API void set_viterbi615_polynomial_mmx(int polys[6]); +SATNOGS_API int init_viterbi615_mmx(void *p, int starting_state); +SATNOGS_API int chainback_viterbi615_mmx(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615_mmx(void *p); +SATNOGS_API int update_viterbi615_blk_mmx(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi615_sse(int len); +SATNOGS_API void set_viterbi615_polynomial_sse(int polys[6]); +SATNOGS_API int init_viterbi615_sse(void *p, int starting_state); +SATNOGS_API int chainback_viterbi615_sse(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615_sse(void *p); +SATNOGS_API int update_viterbi615_blk_sse(void *p, unsigned char *syms, + int nbits); + +SATNOGS_API void *create_viterbi615_sse2(int len); +SATNOGS_API void 
set_viterbi615_polynomial_sse2(int polys[6]); +SATNOGS_API int init_viterbi615_sse2(void *p, int starting_state); +SATNOGS_API int chainback_viterbi615_sse2(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615_sse2(void *p); +SATNOGS_API int update_viterbi615_blk_sse2(void *p, unsigned char *syms, + int nbits); +#endif + +SATNOGS_API void *create_viterbi615_port(int len); +SATNOGS_API void set_viterbi615_polynomial_port(int polys[6]); +SATNOGS_API int init_viterbi615_port(void *p, int starting_state); +SATNOGS_API int chainback_viterbi615_port(void *p, unsigned char *data, + unsigned int nbits, unsigned int endstate); +SATNOGS_API void delete_viterbi615_port(void *p); +SATNOGS_API int update_viterbi615_blk_port(void *p, unsigned char *syms, + int nbits); + + +/* General purpose RS codec, 8-bit symbols */ +SATNOGS_API void encode_rs_char(void *rs, unsigned char *data, + unsigned char *parity); +SATNOGS_API int decode_rs_char(void *rs, unsigned char *data, int *eras_pos, + int no_eras); +SATNOGS_API void *init_rs_char(int symsize, int gfpoly, + int fcr, int prim, int nroots, + int pad); +SATNOGS_API void free_rs_char(void *rs); + +/* General purpose RS codec, integer symbols */ +SATNOGS_API void encode_rs_int(void *rs, int *data, int *parity); +SATNOGS_API int decode_rs_int(void *rs, int *data, int *eras_pos, int no_eras); +SATNOGS_API void *init_rs_int(int symsize, int gfpoly, int fcr, + int prim, int nroots, int pad); +SATNOGS_API void free_rs_int(void *rs); + +/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis) + * symbol representation + */ +SATNOGS_API void encode_rs_8(unsigned char *data, unsigned char *parity, + int pad); +SATNOGS_API int decode_rs_8(unsigned char *data, int *eras_pos, int no_eras, + int pad); + +/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */ +SATNOGS_API void encode_rs_ccsds(unsigned char *data, unsigned char *parity, + int pad); 
+SATNOGS_API int decode_rs_ccsds(unsigned char *data, int *eras_pos, int no_eras, + int pad); + +/* Tables to map from conventional->dual (Taltab) and + * dual->conventional (Tal1tab) bases + */ +extern unsigned char Taltab[], Tal1tab[]; + + +/* CPU SIMD instruction set available */ +SATNOGS_API extern enum cpu_mode {UNKNOWN = 0, PORT, MMX, SSE, SSE2, ALTIVEC} Cpu_mode; +SATNOGS_API void find_cpu_mode( + void); /* Call this once at startup to set Cpu_mode */ + +/* Determine parity of argument: 1 = odd, 0 = even */ +#if defined(__i386__) || defined(__x86_64__) +static inline int parityb(unsigned char x) +{ + __asm__ __volatile__("test %1,%1;setpo %0" : "=q"(x) : "q"(x)); + return x; +} +#else +void partab_init(void); + +static inline int parityb(unsigned char x) +{ + extern unsigned char Partab[256]; + extern int P_init; + if (!P_init) { + partab_init(); + } + return Partab[x]; +} +#endif + + +static inline int parity(int x) +{ + /* Fold down to one byte */ + x ^= (x >> 16); + x ^= (x >> 8); + return parityb(x); +} + +/* Useful utilities for simulation */ +SATNOGS_API double normal_rand(double mean, double std_dev); +SATNOGS_API unsigned char addnoise(int sym, double amp, double gain, + double offset, int clip); + +extern int Bitcnt[]; + +/* Dot product functions */ +SATNOGS_API void *initdp(signed short coeffs[], int len); +SATNOGS_API void freedp(void *dp); +SATNOGS_API long dotprod(void *dp, signed short a[]); + +SATNOGS_API void *initdp_port(signed short coeffs[], int len); +SATNOGS_API void freedp_port(void *dp); +SATNOGS_API long dotprod_port(void *dp, signed short a[]); + +#ifdef __i386__ +SATNOGS_API void *initdp_mmx(signed short coeffs[], int len); +SATNOGS_API void freedp_mmx(void *dp); +SATNOGS_API long dotprod_mmx(void *dp, signed short a[]); + +SATNOGS_API void *initdp_sse(signed short coeffs[], int len); +SATNOGS_API void freedp_sse(void *dp); +SATNOGS_API long dotprod_sse(void *dp, signed short a[]); + +SATNOGS_API void *initdp_sse2(signed short coeffs[], 
int len); +SATNOGS_API void freedp_sse2(void *dp); +SATNOGS_API long dotprod_sse2(void *dp, signed short a[]); +#endif + +#ifdef __x86_64__ +SATNOGS_API void *initdp_sse2(signed short coeffs[], int len); +SATNOGS_API void freedp_sse2(void *dp); +SATNOGS_API long dotprod_sse2(void *dp, signed short a[]); +#endif + +#ifdef __VEC__ +SATNOGS_API void *initdp_av(signed short coeffs[], int len); +SATNOGS_API void freedp_av(void *dp); +SATNOGS_API long dotprod_av(void *dp, signed short a[]); +#endif + +/* Sum of squares - accepts signed shorts, produces unsigned long long */ +SATNOGS_API unsigned long long sumsq(signed short *in, int cnt); +SATNOGS_API unsigned long long sumsq_port(signed short *in, int cnt); + +#ifdef __i386__ +SATNOGS_API unsigned long long sumsq_mmx(signed short *in, int cnt); +SATNOGS_API unsigned long long sumsq_sse(signed short *in, int cnt); +SATNOGS_API unsigned long long sumsq_sse2(signed short *in, int cnt); +#endif +#ifdef __x86_64__ +SATNOGS_API unsigned long long sumsq_sse2(signed short *in, int cnt); +#endif +#ifdef __VEC__ +SATNOGS_API unsigned long long sumsq_av(signed short *in, int cnt); +#endif + + +/* Low-level data structures and routines */ + +SATNOGS_API int cpu_features(void); + +#ifdef __cplusplus +} +#endif + + +#endif /* _FEC_H_ */ + + + diff --git a/include/satnogs/utils.h b/include/satnogs/utils.h index 03f76cd..9d1ac4c 100644 --- a/include/satnogs/utils.h +++ b/include/satnogs/utils.h @@ -23,6 +23,7 @@ #include #include +#include #include namespace gr { @@ -180,7 +181,7 @@ update_crc32(uint32_t crc, const uint8_t *data, size_t len) 0x2D02EF8DL }; - register uint32_t i; + uint32_t i; for (i = 0; i < len; i++) { crc = (crc >> 8) ^ crc32_lut[(crc ^ data[i]) & 0xff]; } diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c47bce0..d3b5698 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -23,6 +23,8 @@ ######################################################################## include(GrPlatform) #define LIB_SUFFIX 
+add_subdirectory(libfec) + list(APPEND satnogs_debug_sources morse_debug_source_impl.cc debug_msg_source_impl.cc @@ -78,23 +80,26 @@ if(NOT satnogs_sources) endif(NOT satnogs_sources) add_library(gnuradio-satnogs SHARED ${satnogs_sources}) -add_dependencies(gnuradio-satnogs fec) +add_dependencies(gnuradio-satnogs gnuradio-satnogs-fec) target_link_libraries(gnuradio-satnogs PUBLIC + gnuradio-satnogs-fec gnuradio::gnuradio-runtime - gnuradio::gnuradio-fft + gnuradio::gnuradio-analog gnuradio::gnuradio-blocks + gnuradio::gnuradio-fft gnuradio::gnuradio-digital gnuradio::gnuradio-pmt ${VOLK_LIBRARIES} ${OGGVORBIS_LIBRARIES} ${PNG_LIBRARIES} ${png++_LIBRARIES} - ${FEC_LIBRARIES} ${JSONCPP_LIBRARY} ) + target_include_directories(gnuradio-satnogs PUBLIC $ + PUBLIC $ PUBLIC $ ) set_target_properties(gnuradio-satnogs PROPERTIES DEFINE_SYMBOL "gnuradio_satnogs_EXPORTS") @@ -128,6 +133,7 @@ include(GrTest) list(APPEND test_satnogs_sources qa_golay24.cc ) + # Anything we need to link to for the unit tests go here list(APPEND GR_TEST_TARGET_DEPS gnuradio-satnogs) diff --git a/lib/amsat_duv_decoder.cc b/lib/amsat_duv_decoder.cc index 6626ebe..eb88fe9 100644 --- a/lib/amsat_duv_decoder.cc +++ b/lib/amsat_duv_decoder.cc @@ -26,10 +26,7 @@ #include #include #include - -extern "C" { -#include -} +#include namespace gr { namespace satnogs { diff --git a/lib/ax100_decoder.cc b/lib/ax100_decoder.cc index 44a6b00..68ebd92 100644 --- a/lib/ax100_decoder.cc +++ b/lib/ax100_decoder.cc @@ -28,10 +28,7 @@ #include #include #include - -extern "C" { -#include -} +#include namespace gr { namespace satnogs { diff --git a/lib/golay24.cc b/lib/golay24.cc index a8201c7..6296638 100644 --- a/lib/golay24.cc +++ b/lib/golay24.cc @@ -23,9 +23,7 @@ #include "config.h" #endif -#include #include - #include namespace gr { diff --git a/libfec/CMakeLists.txt b/lib/libfec/CMakeLists.txt similarity index 54% rename from libfec/CMakeLists.txt rename to 
lib/libfec/CMakeLists.txt index 684a6d0..7ede3ed 100644 --- a/libfec/CMakeLists.txt +++ b/lib/libfec/CMakeLists.txt @@ -1,42 +1,7 @@ -######################################################################## -# Project setup -######################################################################## -cmake_minimum_required(VERSION 2.8) -project(libfec ASM C) - -option(BUILD_32BIT_ON_64BIT "Build a 32-bit library on a 64-bit system" OFF) - -# Select the release build type by default to get optimization flags -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") - message(STATUS "Build type not specified: defaulting to release.") -endif(NOT CMAKE_BUILD_TYPE) -set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "") - -list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) - -if(NOT LIB_INSTALL_DIR) - set(LIB_INSTALL_DIR lib) -endif() - - -######################################################################## -# Version information -######################################################################## -set(VERSION_INFO_MAJOR 3) -set(VERSION_INFO_MINOR 0) -set(VERSION_INFO_PATCH 0) - -if(NOT DEFINED VERSION_INFO_EXTRA) - set(VERSION_INFO_EXTRA "git") -endif() -include(Version) - -if(NOT DEFINED VERSION) - #set(VERSION "\"${VERSION_INFO_MAJOR}.${VERSION_INFO_MINOR}.${VERSION_INFO_PATCH}\"") - set(VERSION "\"${VERSION_INFO}\"") -endif() - +include_directories( + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/include/satnogs/libfec +) ######################################################################## # Compiler specific setup @@ -125,9 +90,9 @@ if(TARGET_ARCH MATCHES "x64") sumsq.c sumsq_port.c cpu_mode_x86_64.c - ##asm - #sse2bfly27-64.s - #sse2bfly29-64.s + ##asm + #sse2bfly27-64.s + #sse2bfly29-64.s ) elseif(TARGET_ARCH MATCHES "x86") @@ -154,24 +119,24 @@ elseif(TARGET_ARCH MATCHES "x86") sumsq_sse2.c sumsq_mmx.c cpu_mode_x86.c - #asm - cpu_features.s - dotprod_mmx_assist.s - dotprod_sse2_assist.s - mmxbfly27.s - mmxbfly29.s - 
peak_mmx_assist.s - peak_sse2_assist.s - peak_sse_assist.s - peakval_mmx_assist.s - peakval_sse2_assist.s - peakval_sse_assist.s - sse2bfly27.s - sse2bfly29.s - ssebfly27.s - ssebfly29.s - sumsq_mmx_assist.s - sumsq_sse2_assist.s + #asm + cpu_features.s + dotprod_mmx_assist.s + dotprod_sse2_assist.s + mmxbfly27.s + mmxbfly29.s + peak_mmx_assist.s + peak_sse2_assist.s + peak_sse_assist.s + peakval_mmx_assist.s + peakval_sse2_assist.s + peakval_sse_assist.s + sse2bfly27.s + sse2bfly29.s + ssebfly27.s + ssebfly29.s + sumsq_mmx_assist.s + sumsq_sse2_assist.s ) elseif(TARGET_ARCH MATCHES "ppc|ppc64") @@ -225,47 +190,6 @@ list(APPEND libfec_sources ccsds_tal.c ) - -################################################################################ -# Generate pkg-config file -################################################################################ -foreach(inc ${LIBFEC_INCLUDE_DIR}) - list(APPEND LIBFEC_PC_CFLAGS "-I${inc}") -endforeach() - -foreach(lib ${LIBFEC_LIBRARY_DIRS}) - list(APPEND LIBFEC_PC_PRIV_LIBS "-L${lib}") -endforeach() - -set(LIBFEC_PC_PREFIX ${CMAKE_INSTALL_PREFIX}) -set(LIBFEC_PC_EXEC_PREFIX \${prefix}) -set(LIBFEC_PC_LIBDIR \${exec_prefix}/${LIB_INSTALL_DIR}) -set(LIBFEC_PC_INCLUDEDIR \${prefix}/include) -set(LIBFEC_PC_VERSION ${VERSION}) -set(LIBFEC_PC_LIBS "-lfec") - -# Use space-delimiter in the .pc file, rather than CMake's semicolon separator -string(REPLACE ";" " " LIBFEC_PC_CFLAGS "${LIBFEC_PC_CFLAGS}") -string(REPLACE ";" " " LIBFEC_PC_LIBS "${LIBFEC_PC_LIBS}") - -# Unset these to avoid hard-coded paths in a cross-environment -if(CMAKE_CROSSCOMPILING) - unset(LIBFEC_PC_CFLAGS) - unset(LIBFEC_PC_LIBS) -endif() - -configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/libfec.pc.in - ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc - @ONLY -) - -install( - FILES ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc - DESTINATION ${LIB_INSTALL_DIR}/pkgconfig/ -) - - ######################################################################## # Setup libraries 
######################################################################## @@ -273,51 +197,43 @@ install( # generate ccsds_tab.c add_executable(gen_ccsds gen_ccsds.c init_rs_char.c) add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/ccsds_tab.c - COMMAND ${CMAKE_BINARY_DIR}/gen_ccsds > ccsds_tab.c + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c + COMMAND gen_ccsds > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c DEPENDS gen_ccsds + VERBATIM ) # generate ccsds_tal.c add_executable(gen_ccsds_tal gen_ccsds_tal.c) add_custom_command( - OUTPUT ${CMAKE_BINARY_DIR}/ccsds_tal.c - COMMAND ${CMAKE_BINARY_DIR}/gen_ccsds_tal > ccsds_tal.c + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c + COMMAND gen_ccsds_tal > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c DEPENDS gen_ccsds_tal + VERBATIM ) # libfec -add_library(libfec_shared SHARED ${libfec_sources}) -set_target_properties(libfec_shared PROPERTIES OUTPUT_NAME fec) -target_link_libraries(libfec_shared ${M_LIB}) +add_library(gnuradio-satnogs-fec SHARED ${libfec_sources}) +target_link_libraries(gnuradio-satnogs-fec ${M_LIB}) -install(TARGETS libfec_shared - DESTINATION ${LIB_INSTALL_DIR}) -install(FILES "${PROJECT_SOURCE_DIR}/fec.h" - DESTINATION include) +target_include_directories(gnuradio-satnogs-fec + PUBLIC + $ + $ + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/ + ) +if(APPLE) + set_target_properties(gnuradio-satnogs-fec PROPERTIES + INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib" + ) +endif(APPLE) ######################################################################## -# Create uninstall target +# Install built library files ######################################################################## -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" - "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" - IMMEDIATE @ONLY) - -add_custom_target(uninstall - COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) +include(GrMiscUtils) +GR_LIBRARY_FOO(gnuradio-satnogs-fec) 
-######################################################################## -# Print Summary -######################################################################## -message(STATUS "") -message(STATUS "##########################################################") -message(STATUS "## Building for version: ${VERSION}") -message(STATUS "## Target Architecture: ${TARGET_ARCH}") -message(STATUS "## Using install prefix: ${CMAKE_INSTALL_PREFIX}") -message(STATUS "##########################################################") -message(STATUS "") - diff --git a/libfec/INSTALL b/lib/libfec/INSTALL similarity index 100% rename from libfec/INSTALL rename to lib/libfec/INSTALL diff --git a/libfec/LICENSE b/lib/libfec/LICENSE similarity index 100% rename from libfec/LICENSE rename to lib/libfec/LICENSE diff --git a/libfec/README b/lib/libfec/README similarity index 100% rename from libfec/README rename to lib/libfec/README diff --git a/libfec/README.x86-64 b/lib/libfec/README.x86-64 similarity index 100% rename from libfec/README.x86-64 rename to lib/libfec/README.x86-64 diff --git a/libfec/bootstrap b/lib/libfec/bootstrap similarity index 100% rename from libfec/bootstrap rename to lib/libfec/bootstrap diff --git a/libfec/ccsds.h b/lib/libfec/ccsds.h similarity index 60% rename from libfec/ccsds.h rename to lib/libfec/ccsds.h index ae65468..363d064 100644 --- a/libfec/ccsds.h +++ b/lib/libfec/ccsds.h @@ -1,5 +1,5 @@ typedef unsigned char data_t; -extern unsigned char Taltab[],Tal1tab[]; +extern unsigned char Taltab[], Tal1tab[]; #define NN 255 #define NROOTS 32 diff --git a/libfec/char.h b/lib/libfec/char.h similarity index 93% rename from libfec/char.h rename to lib/libfec/char.h index 25efd65..abb85c7 100644 --- a/libfec/char.h +++ b/lib/libfec/char.h @@ -9,7 +9,7 @@ typedef unsigned char data_t; #define MM (rs->mm) #define NN (rs->nn) -#define ALPHA_TO (rs->alpha_to) +#define ALPHA_TO (rs->alpha_to) #define INDEX_OF (rs->index_of) #define GENPOLY (rs->genpoly) #define NROOTS 
(rs->nroots) diff --git a/libfec/cmake/Modules/Version.cmake b/lib/libfec/cmake/Modules/Version.cmake similarity index 100% rename from libfec/cmake/Modules/Version.cmake rename to lib/libfec/cmake/Modules/Version.cmake diff --git a/libfec/cmake/cmake_uninstall.cmake.in b/lib/libfec/cmake/cmake_uninstall.cmake.in similarity index 100% rename from libfec/cmake/cmake_uninstall.cmake.in rename to lib/libfec/cmake/cmake_uninstall.cmake.in diff --git a/libfec/config.guess b/lib/libfec/config.guess similarity index 100% rename from libfec/config.guess rename to lib/libfec/config.guess diff --git a/libfec/config.sub b/lib/libfec/config.sub similarity index 100% rename from libfec/config.sub rename to lib/libfec/config.sub diff --git a/libfec/configure.in b/lib/libfec/configure.in similarity index 100% rename from libfec/configure.in rename to lib/libfec/configure.in diff --git a/libfec/cpu_features.s b/lib/libfec/cpu_features.s similarity index 100% rename from libfec/cpu_features.s rename to lib/libfec/cpu_features.s diff --git a/libfec/cpu_mode_generic.c b/lib/libfec/cpu_mode_generic.c similarity index 90% rename from libfec/cpu_mode_generic.c rename to lib/libfec/cpu_mode_generic.c index 500f995..7bd5ad5 100644 --- a/libfec/cpu_mode_generic.c +++ b/lib/libfec/cpu_mode_generic.c @@ -8,6 +8,7 @@ enum cpu_mode Cpu_mode; // Use the portable code for this unknown CPU -void find_cpu_mode(void) { +void find_cpu_mode(void) +{ Cpu_mode = PORT; } diff --git a/lib/libfec/cpu_mode_ppc.c b/lib/libfec/cpu_mode_ppc.c new file mode 100644 index 0000000..ecbeb96 --- /dev/null +++ b/lib/libfec/cpu_mode_ppc.c @@ -0,0 +1,45 @@ +/* Determine CPU support for SIMD on Power PC + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" +#ifdef __VEC__ +#include +#endif + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown", "Portable C", "x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + 
"PowerPC G4/G5 Altivec/Velocity Engine" + }; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void) +{ + + if (Cpu_mode != UNKNOWN) { + return; + } + +#ifdef __VEC__ + { + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if (0 == error && hasVectorUnit) { + Cpu_mode = ALTIVEC; + } + else { + Cpu_mode = PORT; + } + } +#else + Cpu_mode = PORT; +#endif + + fprintf(stderr, "SIMD CPU detect: %s\n", Cpu_modes[Cpu_mode]); +} diff --git a/lib/libfec/cpu_mode_x86.c b/lib/libfec/cpu_mode_x86.c new file mode 100644 index 0000000..e2e1a53 --- /dev/null +++ b/lib/libfec/cpu_mode_x86.c @@ -0,0 +1,39 @@ +/* Determine CPU support for SIMD + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown", "Portable C", "x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine" + }; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void) +{ + + int f; + if (Cpu_mode != UNKNOWN) { + return; + } + + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if (f & (1 << 26)) { /* SSE2 is present */ + Cpu_mode = SSE2; + } + else if (f & (1 << 25)) { /* SSE is present */ + Cpu_mode = SSE; + } + else if (f & (1 << 23)) { /* MMX is present */ + Cpu_mode = MMX; + } + else { /* No SIMD at all */ + Cpu_mode = PORT; + } + fprintf(stderr, "SIMD CPU detect: %s\n", Cpu_modes[Cpu_mode]); +} diff --git a/lib/libfec/cpu_mode_x86_64.c b/lib/libfec/cpu_mode_x86_64.c new file mode 100644 index 0000000..aa6277b --- /dev/null +++ b/lib/libfec/cpu_mode_x86_64.c @@ -0,0 +1,30 @@ +/* Determine CPU support for SIMD + * Copyright 2004 Phil Karn, KA9Q + * + * Modified in 2012 by Matthias P. 
Braendli, HB9EGM + */ +#include +#include "fec.h" + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown", "Portable C", "x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine" + }; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void) +{ + + int f; + if (Cpu_mode != UNKNOWN) { + return; + } + + /* According to the wikipedia entry x86-64, all x86-64 processors have SSE2 */ + /* The same assumption is also in other source files ! */ + Cpu_mode = SSE2; + fprintf(stderr, "CPU: x86-64, using portable C implementation\n"); +} diff --git a/libfec/decode_rs.c b/lib/libfec/decode_rs.c similarity index 51% rename from libfec/decode_rs.c rename to lib/libfec/decode_rs.c index d7f97b3..c170ea7 100644 --- a/libfec/decode_rs.c +++ b/lib/libfec/decode_rs.c @@ -10,7 +10,7 @@ #include #define NULL ((void *)0) -#define min(a,b) ((a) < (b) ? (a) : (b)) +#define min(a,b) ((a) < (b) ? 
(a) : (b)) #ifdef FIXED #include "fixed.h" @@ -22,43 +22,48 @@ int DECODE_RS( #ifdef FIXED -data_t *data, int *eras_pos, int no_eras,int pad){ + data_t *data, int *eras_pos, int no_eras, int pad) +{ #else -void *p,data_t *data, int *eras_pos, int no_eras){ + void *p, data_t *data, int *eras_pos, int no_eras) +{ struct rs *rs = (struct rs *)p; #endif int deg_lambda, el, deg_omega; - int i, j, r,k; - data_t u,q,tmp,num1,num2,den,discr_r; - data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly - * and syndrome poly */ - data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; - data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int i, j, r, k; + data_t u, q, tmp, num1, num2, den, discr_r; + data_t lambda[NROOTS + 1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS + 1], t[NROOTS + 1], omega[NROOTS + 1]; + data_t root[NROOTS], reg[NROOTS + 1], loc[NROOTS]; int syn_error, count; #ifdef FIXED /* Check pad parameter for validity */ - if(pad < 0 || pad >= NN) + if (pad < 0 || pad >= NN) { return -1; + } #endif /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ - for(i=0;i 0) { /* Init lambda to be the erasure locator polynomial */ - lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + lambda[1] = ALPHA_TO[MODNN(PRIM * (NN - 1 - eras_pos[0]))]; for (i = 1; i < no_eras; i++) { - u = MODNN(PRIM*(NN-1-eras_pos[i])); - for (j = i+1; j > 0; j--) { - tmp = INDEX_OF[lambda[j - 1]]; - if(tmp != A0) - lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + u = MODNN(PRIM * (NN - 1 - eras_pos[i])); + for (j = i + 1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if (tmp != A0) { + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } } } #if DEBUG >= 1 /* Test code that verifies the erasure locator polynomial just constructed Needed only for decoder debugging. 
*/ - + /* find roots of the erasure location polynomial */ - for(i=1;i<=no_eras;i++) + for (i = 1; i <= no_eras; i++) { reg[i] = INDEX_OF[lambda[i]]; + } count = 0; - for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + for (i = 1, k = IPRIM - 1; i <= NN; i++, k = MODNN(k + IPRIM)) { q = 1; for (j = 1; j <= no_eras; j++) - if (reg[j] != A0) { - reg[j] = MODNN(reg[j] + j); - q ^= ALPHA_TO[reg[j]]; - } - if (q != 0) - continue; + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) { + continue; + } /* store root and error location number indices */ root[count] = i; loc[count] = k; count++; } if (count != no_eras) { - printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n", count, no_eras); count = -1; goto finish; } #if DEBUG >= 2 printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { printf("%d ", loc[i]); + } printf("\n"); #endif #endif } - for(i=0;i 0; j--){ + for (j = deg_lambda; j > 0; j--) { if (reg[j] != A0) { - reg[j] = MODNN(reg[j] + j); - q ^= ALPHA_TO[reg[j]]; + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; } } - if (q != 0) - continue; /* Not a root */ + if (q != 0) { + continue; /* Not a root */ + } /* store root (index-form) and error location number */ #if DEBUG>=2 - printf("count %d root %d loc %d\n",count,i,k); + printf("count %d root %d loc %d\n", count, i, k); #endif root[count] = i; loc[count] = k; /* If we've already found max possible roots, * abort the search to save time */ - if(++count == deg_lambda) + if (++count == deg_lambda) { break; + } } if (deg_lambda != count) { /* @@ -213,12 +231,13 @@ void *p,data_t *data, int *eras_pos, int no_eras){ * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo * x**NROOTS). in index form. Also find deg(omega). 
*/ - deg_omega = deg_lambda-1; - for (i = 0; i <= deg_omega;i++){ + deg_omega = deg_lambda - 1; + for (i = 0; i <= deg_omega; i++) { tmp = 0; - for(j=i;j >= 0; j--){ - if ((s[i - j] != A0) && (lambda[j] != A0)) - tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + for (j = i; j >= 0; j--) { + if ((s[i - j] != A0) && (lambda[j] != A0)) { + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } } omega[i] = INDEX_OF[tmp]; } @@ -227,19 +246,21 @@ void *p,data_t *data, int *eras_pos, int no_eras){ * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 = * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form */ - for (j = count-1; j >=0; j--) { + for (j = count - 1; j >= 0; j--) { num1 = 0; for (i = deg_omega; i >= 0; i--) { - if (omega[i] != A0) - num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + if (omega[i] != A0) { + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } } num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; den = 0; - + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ - for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { - if(lambda[i+1] != A0) - den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + for (i = min(deg_lambda, NROOTS - 1) & ~1; i >= 0; i -= 2) { + if (lambda[i + 1] != A0) { + den ^= ALPHA_TO[MODNN(lambda[i + 1] + i * root[j])]; + } } #if DEBUG >= 1 if (den == 0) { @@ -250,13 +271,15 @@ void *p,data_t *data, int *eras_pos, int no_eras){ #endif /* Apply error to data */ if (num1 != 0 && loc[j] >= PAD) { - data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + data[loc[j] - PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - + INDEX_OF[den])]; } } - finish: - if(eras_pos != NULL){ - for(i=0;i 0) { + if (no_eras > 0) + { /* Init lambda to be the erasure locator polynomial */ - lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + lambda[1] = ALPHA_TO[MODNN(PRIM * (NN - 1 - eras_pos[0]))]; for (i = 1; i < no_eras; i++) { - u = 
MODNN(PRIM*(NN-1-eras_pos[i])); - for (j = i+1; j > 0; j--) { - tmp = INDEX_OF[lambda[j - 1]]; - if(tmp != A0) - lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + u = MODNN(PRIM * (NN - 1 - eras_pos[i])); + for (j = i + 1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if (tmp != A0) { + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } } } #if DEBUG >= 1 /* Test code that verifies the erasure locator polynomial just constructed Needed only for decoder debugging. */ - + /* find roots of the erasure location polynomial */ - for(i=1;i<=no_eras;i++) + for (i = 1; i <= no_eras; i++) { reg[i] = INDEX_OF[lambda[i]]; + } count = 0; - for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + for (i = 1, k = IPRIM - 1; i <= NN; i++, k = MODNN(k + IPRIM)) { q = 1; for (j = 1; j <= no_eras; j++) - if (reg[j] != A0) { - reg[j] = MODNN(reg[j] + j); - q ^= ALPHA_TO[reg[j]]; - } - if (q != 0) - continue; + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) { + continue; + } /* store root and error location number indices */ root[count] = i; loc[count] = k; count++; } if (count != no_eras) { - printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n", count, no_eras); count = -1; goto finish; } #if DEBUG >= 2 printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { printf("%d ", loc[i]); + } printf("\n"); #endif #endif } - for(i=0;i 0; j--){ + for (j = deg_lambda; j > 0; j--) { if (reg[j] != A0) { - reg[j] = MODNN(reg[j] + j); - q ^= ALPHA_TO[reg[j]]; + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; } } - if (q != 0) - continue; /* Not a root */ + if (q != 0) { + continue; /* Not a root */ + } /* store root (index-form) and error location number */ #if DEBUG>=2 - printf("count %d root %d loc %d\n",count,i,k); + printf("count %d root %d loc %d\n", count, i, k); #endif root[count] = i; loc[count] = 
k; /* If we've already found max possible roots, * abort the search to save time */ - if(++count == deg_lambda) + if (++count == deg_lambda) { break; + } } - if (deg_lambda != count) { + if (deg_lambda != count) + { /* * deg(lambda) unequal to number of roots => uncorrectable * error detected @@ -249,12 +274,14 @@ * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo * x**NROOTS). in index form. Also find deg(omega). */ - deg_omega = deg_lambda-1; - for (i = 0; i <= deg_omega;i++){ + deg_omega = deg_lambda - 1; + for (i = 0; i <= deg_omega; i++) + { tmp = 0; - for(j=i;j >= 0; j--){ - if ((s[i - j] != A0) && (lambda[j] != A0)) - tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + for (j = i; j >= 0; j--) { + if ((s[i - j] != A0) && (lambda[j] != A0)) { + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } } omega[i] = INDEX_OF[tmp]; } @@ -263,19 +290,22 @@ * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 = * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form */ - for (j = count-1; j >=0; j--) { + for (j = count - 1; j >= 0; j--) + { num1 = 0; for (i = deg_omega; i >= 0; i--) { - if (omega[i] != A0) - num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + if (omega[i] != A0) { + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } } num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; den = 0; - + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ - for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { - if(lambda[i+1] != A0) - den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + for (i = MIN(deg_lambda, NROOTS - 1) & ~1; i >= 0; i -= 2) { + if (lambda[i + 1] != A0) { + den ^= ALPHA_TO[MODNN(lambda[i + 1] + i * root[j])]; + } } #if DEBUG >= 1 if (den == 0) { @@ -286,13 +316,16 @@ #endif /* Apply error to data */ if (num1 != 0 && loc[j] >= PAD) { - data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + data[loc[j] - PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + 
INDEX_OF[num2] + NN - + INDEX_OF[den])]; } } - finish: - if(eras_pos != NULL){ - for(i=0;i #endif +#include #include #include "fixed.h" -int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){ +SATNOGS_API int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad) +{ int retval; - - if(pad < 0 || pad > 222){ + + if (pad < 0 || pad > 222) { return -1; } #include "decode_rs.h" - + return retval; } diff --git a/libfec/decode_rs_ccsds.c b/lib/libfec/decode_rs_ccsds.c similarity index 64% rename from libfec/decode_rs_ccsds.c rename to lib/libfec/decode_rs_ccsds.c index 0e246b4..61204e7 100644 --- a/libfec/decode_rs_ccsds.c +++ b/lib/libfec/decode_rs_ccsds.c @@ -7,20 +7,25 @@ #include "ccsds.h" #include "fec.h" -int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){ - int i,r; +#include + +SATNOGS_API int decode_rs_ccsds(data_t *data, int *eras_pos, int no_eras, int pad) +{ + int i, r; data_t cdata[NN]; /* Convert data from dual basis to conventional */ - for(i=0;i 0){ + if (r > 0) { /* Convert from conventional to dual basis */ - for(i=0;i #endif +#include + #include #include "char.h" #include "rs-common.h" -int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){ +SATNOGS_API int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras) +{ int retval; struct rs *rs = (struct rs *)p; - + #include "decode_rs.h" - + return retval; } diff --git a/libfec/decode_rs_int.c b/lib/libfec/decode_rs_int.c similarity index 76% rename from libfec/decode_rs_int.c rename to lib/libfec/decode_rs_int.c index 1ef1a1f..8152220 100644 --- a/libfec/decode_rs_int.c +++ b/lib/libfec/decode_rs_int.c @@ -6,17 +6,18 @@ #ifdef DEBUG #include #endif - +#include #include #include "int.h" #include "rs-common.h" -int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){ +SATNOGS_API int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras) +{ int retval; struct rs *rs = (struct rs *)p; - + #include "decode_rs.h" - + return 
retval; } diff --git a/libfec/dotprod.c b/lib/libfec/dotprod.c similarity index 58% rename from libfec/dotprod.c rename to lib/libfec/dotprod.c index 5fb1da9..ed566f8 100644 --- a/libfec/dotprod.c +++ b/lib/libfec/dotprod.c @@ -6,57 +6,59 @@ #include #include "fec.h" -void *initdp_port(signed short coeffs[],int len); -long dotprod_port(void *p,signed short *b); +void *initdp_port(signed short coeffs[], int len); +long dotprod_port(void *p, signed short *b); void freedp_port(void *p); #ifdef __i386__ -void *initdp_mmx(signed short coeffs[],int len); -void *initdp_sse2(signed short coeffs[],int len); -long dotprod_mmx(void *p,signed short *b); -long dotprod_sse2(void *p,signed short *b); +void *initdp_mmx(signed short coeffs[], int len); +void *initdp_sse2(signed short coeffs[], int len); +long dotprod_mmx(void *p, signed short *b); +long dotprod_sse2(void *p, signed short *b); void freedp_mmx(void *p); void freedp_sse2(void *p); #endif #ifdef __VEC__ -void *initdp_av(signed short coeffs[],int len); -long dotprod_av(void *p,signed short *b); +void *initdp_av(signed short coeffs[], int len); +long dotprod_av(void *p, signed short *b); void freedp_av(void *p); #endif /* Create and return a descriptor for use with the dot product function */ -void *initdp(signed short coeffs[],int len){ +void *initdp(signed short coeffs[], int len) +{ find_cpu_mode(); - switch(Cpu_mode){ + switch (Cpu_mode) { case PORT: default: - return initdp_port(coeffs,len); + return initdp_port(coeffs, len); #ifdef __i386__ case MMX: case SSE: - return initdp_mmx(coeffs,len); + return initdp_mmx(coeffs, len); case SSE2: - return initdp_sse2(coeffs,len); + return initdp_sse2(coeffs, len); #endif #ifdef __x86_64__ case SSE2: - return initdp_port(coeffs,len); + return initdp_port(coeffs, len); #endif #ifdef __VEC__ case ALTIVEC: - return initdp_av(coeffs,len); + return initdp_av(coeffs, len); #endif } } /* Free a dot product descriptor created earlier */ -void freedp(void *p){ - switch(Cpu_mode){ 
+void freedp(void *p) +{ + switch (Cpu_mode) { case PORT: default: return freedp_port(p); @@ -83,27 +85,28 @@ void freedp(void *p){ /* Compute a dot product given a descriptor and an input array * The length is taken from the descriptor */ -long dotprod(void *p,signed short a[]){ - switch(Cpu_mode){ +long dotprod(void *p, signed short a[]) +{ + switch (Cpu_mode) { case PORT: default: - return dotprod_port(p,a); + return dotprod_port(p, a); #ifdef __i386__ case MMX: case SSE: - return dotprod_mmx(p,a); + return dotprod_mmx(p, a); case SSE2: - return dotprod_sse2(p,a); + return dotprod_sse2(p, a); #endif #ifdef __x86_64__ case SSE2: - return dotprod_port(p,a); + return dotprod_port(p, a); #endif #ifdef __VEC__ case ALTIVEC: - return dotprod_av(p,a); + return dotprod_av(p, a); #endif } } diff --git a/libfec/dotprod.h b/lib/libfec/dotprod.h similarity index 100% rename from libfec/dotprod.h rename to lib/libfec/dotprod.h diff --git a/libfec/dotprod_av.c b/lib/libfec/dotprod_av.c similarity index 55% rename from libfec/dotprod_av.c rename to lib/libfec/dotprod_av.c index 1f70471..bbea8af 100644 --- a/libfec/dotprod_av.c +++ b/lib/libfec/dotprod_av.c @@ -16,77 +16,86 @@ struct dotprod { }; /* Create and return a descriptor for use with the dot product function */ -void *initdp_av(signed short coeffs[],int len){ +void *initdp_av(signed short coeffs[], int len) +{ struct dotprod *dp; - int i,j; + int i, j; - if(len == 0) + if (len == 0) { return NULL; + } - dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp = (struct dotprod *)calloc(1, sizeof(struct dotprod)); dp->len = len; /* Make 8 copies of coefficients, one for each data alignment, * each aligned to 16-byte boundary */ - for(i=0;i<8;i++){ - dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); - for(j=0;jcoeffs[i][j+i] = coeffs[j]; + for (i = 0; i < 8; i++) { + dp->coeffs[i] = calloc(1 + (len + i - 1) / 8, sizeof(vector signed short)); + for (j = 0; j < len; j++) { + dp->coeffs[i][j + i] = 
coeffs[j]; + } } return (void *)dp; } /* Free a dot product descriptor created earlier */ -void freedp_av(void *p){ +void freedp_av(void *p) +{ struct dotprod *dp = (struct dotprod *)p; int i; - for(i=0;i<8;i++) - if(dp->coeffs[i] != NULL) + for (i = 0; i < 8; i++) + if (dp->coeffs[i] != NULL) { free(dp->coeffs[i]); + } free(dp); } /* Compute a dot product given a descriptor and an input array * The length is taken from the descriptor */ -long dotprod_av(void *p,signed short a[]){ +long dotprod_av(void *p, signed short a[]) +{ struct dotprod *dp = (struct dotprod *)p; int al; - vector signed short *ar,*d; - vector signed int sums0,sums1,sums2,sums3; - union { vector signed int v; signed int w[4];} s; + vector signed short *ar, *d; + vector signed int sums0, sums1, sums2, sums3; + union { + vector signed int v; + signed int w[4]; + } s; int nblocks; - + /* round ar down to beginning of 16-byte block containing 0th element of * input buffer. Then set d to one of 8 sets of shifted coefficients */ ar = (vector signed short *)((int)a & ~15); - al = ((int)a & 15)/sizeof(signed short); + al = ((int)a & 15) / sizeof(signed short); d = (vector signed short *)dp->coeffs[al]; - - nblocks = (dp->len+al-1)/8+1; - + + nblocks = (dp->len + al - 1) / 8 + 1; + /* Sum into four vectors each holding four 32-bit partial sums */ sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); - while(nblocks >= 4){ - sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); - sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); - sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); - sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); + while (nblocks >= 4) { + sums0 = vec_msums(ar[nblocks - 1], d[nblocks - 1], sums0); + sums1 = vec_msums(ar[nblocks - 2], d[nblocks - 2], sums1); + sums2 = vec_msums(ar[nblocks - 3], d[nblocks - 3], sums2); + sums3 = vec_msums(ar[nblocks - 4], d[nblocks - 4], sums3); nblocks -= 4; } - sums0 = vec_adds(sums0,sums1); - sums2 = vec_adds(sums2,sums3); - sums0 = 
vec_adds(sums0,sums2); - while(nblocks-- > 0){ - sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); + sums0 = vec_adds(sums0, sums1); + sums2 = vec_adds(sums2, sums3); + sums0 = vec_adds(sums0, sums2); + while (nblocks-- > 0) { + sums0 = vec_msums(ar[nblocks], d[nblocks], sums0); } /* Sum 4 partial sums into final result */ - s.v = vec_sums(sums0,(vector signed int)(0)); - + s.v = vec_sums(sums0, (vector signed int)(0)); + return s.w[3]; } diff --git a/libfec/dotprod_mmx.c b/lib/libfec/dotprod_mmx.c similarity index 72% rename from libfec/dotprod_mmx.c rename to lib/libfec/dotprod_mmx.c index c516afe..0b5df0b 100644 --- a/libfec/dotprod_mmx.c +++ b/lib/libfec/dotprod_mmx.c @@ -16,50 +16,56 @@ struct dotprod { */ signed short *coeffs[4]; }; -long dotprod_mmx_assist(signed short *a,signed short *b,int cnt); +long dotprod_mmx_assist(signed short *a, signed short *b, int cnt); /* Create and return a descriptor for use with the dot product function */ -void *initdp_mmx(signed short coeffs[],int len){ +void *initdp_mmx(signed short coeffs[], int len) +{ struct dotprod *dp; - int i,j; + int i, j; - if(len == 0) + if (len == 0) { return NULL; + } - dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp = (struct dotprod *)calloc(1, sizeof(struct dotprod)); dp->len = len; /* Make 4 copies of coefficients, one for each data alignment */ - for(i=0;i<4;i++){ - dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4, - 4*sizeof(signed short)); - for(j=0;jcoeffs[i][j+i] = coeffs[j]; + for (i = 0; i < 4; i++) { + dp->coeffs[i] = (signed short *)calloc(1 + (len + i - 1) / 4, + 4 * sizeof(signed short)); + for (j = 0; j < len; j++) { + dp->coeffs[i][j + i] = coeffs[j]; + } } return (void *)dp; } /* Free a dot product descriptor created earlier */ -void freedp_mmx(void *p){ +void freedp_mmx(void *p) +{ struct dotprod *dp = (struct dotprod *)p; int i; - for(i=0;i<4;i++) - if(dp->coeffs[i] != NULL) + for (i = 0; i < 4; i++) + if (dp->coeffs[i] != NULL) { free(dp->coeffs[i]); + } 
free(dp); } /* Compute a dot product given a descriptor and an input array * The length is taken from the descriptor */ -long dotprod_mmx(void *p,signed short a[]){ +long dotprod_mmx(void *p, signed short a[]) +{ struct dotprod *dp = (struct dotprod *)p; int al; signed short *ar; - + /* Round input data address down to 8 byte boundary * NB: depending on the alignment of a[], memory * before a[] will be accessed. The contents don't matter since they'll @@ -68,14 +74,14 @@ long dotprod_mmx(void *p,signed short a[]){ * in the x86 machines is done on much larger boundaries */ ar = (signed short *)((int)a & ~7); - + /* Choose one of 4 sets of pre-shifted coefficients. al is both the * index into dp->coeffs[] and the number of 0 words padded onto * that coefficients array for alignment purposes */ al = a - ar; - + /* Call assembler routine to do the work, passing number of 4-word blocks */ - return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1); + return dotprod_mmx_assist(ar, dp->coeffs[al], (dp->len + al - 1) / 4 + 1); } diff --git a/libfec/dotprod_mmx_assist.s b/lib/libfec/dotprod_mmx_assist.s similarity index 100% rename from libfec/dotprod_mmx_assist.s rename to lib/libfec/dotprod_mmx_assist.s diff --git a/libfec/dotprod_port.c b/lib/libfec/dotprod_port.c similarity index 69% rename from libfec/dotprod_port.c rename to lib/libfec/dotprod_port.c index ef635ec..ad3ed96 100644 --- a/libfec/dotprod_port.c +++ b/lib/libfec/dotprod_port.c @@ -13,43 +13,49 @@ struct dotprod { }; /* Create and return a descriptor for use with the dot product function */ -void *initdp_port(signed short coeffs[],int len){ +void *initdp_port(signed short coeffs[], int len) +{ struct dotprod *dp; int j; - if(len == 0) + if (len == 0) { return NULL; + } - dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp = (struct dotprod *)calloc(1, sizeof(struct dotprod)); dp->len = len; /* Just one copy of the coefficients for the C version */ - dp->coeffs = (signed short 
*)calloc(len,sizeof(signed short)); - for(j=0;jcoeffs = (signed short *)calloc(len, sizeof(signed short)); + for (j = 0; j < len; j++) { dp->coeffs[j] = coeffs[j]; + } return (void *)dp; } /* Free a dot product descriptor created earlier */ -void freedp_port(void *p){ +void freedp_port(void *p) +{ struct dotprod *dp = (struct dotprod *)p; - if(dp->coeffs != NULL) - free(dp->coeffs); + if (dp->coeffs != NULL) { + free(dp->coeffs); + } free(dp); } /* Compute a dot product given a descriptor and an input array * The length is taken from the descriptor */ -long dotprod_port(void *p,signed short a[]){ +long dotprod_port(void *p, signed short a[]) +{ struct dotprod *dp = (struct dotprod *)p; long corr; int i; corr = 0; - for(i=0;ilen;i++){ + for (i = 0; i < dp->len; i++) { corr += (long)a[i] * dp->coeffs[i]; } return corr; diff --git a/libfec/dotprod_sse2.c b/lib/libfec/dotprod_sse2.c similarity index 64% rename from libfec/dotprod_sse2.c rename to lib/libfec/dotprod_sse2.c index 1fddd18..39157e2 100644 --- a/libfec/dotprod_sse2.c +++ b/lib/libfec/dotprod_sse2.c @@ -18,55 +18,61 @@ struct dotprod { signed short *coeffs[8]; }; -long dotprod_sse2_assist(signed short *a,signed short *b,int cnt); +long dotprod_sse2_assist(signed short *a, signed short *b, int cnt); /* Create and return a descriptor for use with the dot product function */ -void *initdp_sse2(signed short coeffs[],int len){ +void *initdp_sse2(signed short coeffs[], int len) +{ struct dotprod *dp; - int i,j,blksize; + int i, j, blksize; - if(len == 0) + if (len == 0) { return NULL; + } - dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp = (struct dotprod *)calloc(1, sizeof(struct dotprod)); dp->len = len; /* Make 8 copies of coefficients, one for each data alignment, * each aligned to 16-byte boundary */ - for(i=0;i<8;i++){ - blksize = (1+(len+i-1)/8) * 8*sizeof(signed short); - posix_memalign((void **)&dp->coeffs[i],16,blksize); - memset(dp->coeffs[i],0,blksize); - for(j=0;jcoeffs[i][j+i] = 
coeffs[j]; + for (i = 0; i < 8; i++) { + blksize = (1 + (len + i - 1) / 8) * 8 * sizeof(signed short); + posix_memalign((void **)&dp->coeffs[i], 16, blksize); + memset(dp->coeffs[i], 0, blksize); + for (j = 0; j < len; j++) { + dp->coeffs[i][j + i] = coeffs[j]; + } } return (void *)dp; } /* Free a dot product descriptor created earlier */ -void freedp_sse2(void *p){ +void freedp_sse2(void *p) +{ struct dotprod *dp = (struct dotprod *)p; int i; - for(i=0;i<8;i++) - if(dp->coeffs[i] != NULL) + for (i = 0; i < 8; i++) + if (dp->coeffs[i] != NULL) { free(dp->coeffs[i]); + } free(dp); } /* Compute a dot product given a descriptor and an input array * The length is taken from the descriptor */ -long dotprod_sse2(void *p,signed short a[]){ +long dotprod_sse2(void *p, signed short a[]) +{ struct dotprod *dp = (struct dotprod *)p; int al; signed short *ar; - + ar = (signed short *)((int)a & ~15); al = a - ar; - + /* Call assembler routine to do the work, passing number of 8-word blocks */ - return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1); + return dotprod_sse2_assist(ar, dp->coeffs[al], (dp->len + al - 1) / 8 + 1); } diff --git a/libfec/dotprod_sse2_assist.s b/lib/libfec/dotprod_sse2_assist.s similarity index 100% rename from libfec/dotprod_sse2_assist.s rename to lib/libfec/dotprod_sse2_assist.s diff --git a/libfec/dsp.3 b/lib/libfec/dsp.3 similarity index 100% rename from libfec/dsp.3 rename to lib/libfec/dsp.3 diff --git a/libfec/dtest.c b/lib/libfec/dtest.c similarity index 56% rename from libfec/dtest.c rename to lib/libfec/dtest.c index 394cb03..58d2f83 100644 --- a/libfec/dtest.c +++ b/lib/libfec/dtest.c @@ -12,28 +12,29 @@ #if HAVE_GETOPT_LONG struct option Options[] = { - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {"trials",0,NULL,'n'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + 
{"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {"trials", 0, NULL, 'n'}, {NULL}, }; #endif -int main(int argc,char *argv[]){ +int main(int argc, char *argv[]) +{ short coeffs[512]; short input[2048]; - int trials=1000,d; + int trials = 1000, d; int errors = 0; #if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){ + while ((d = getopt_long(argc, argv, "apmstn:", Options, NULL)) != EOF) { #else - while((d = getopt(argc,argv,"apmstn:")) != EOF){ + while ((d = getopt(argc, argv, "apmstn:")) != EOF) { #endif - switch(d){ + switch (d) { case 'a': Cpu_mode = ALTIVEC; break; @@ -55,45 +56,47 @@ int main(int argc,char *argv[]){ } } - while(trials--){ + while (trials--) { long port_result; long simd_result; int ntaps; int i; int csum = 0; int offset; - void *dp_simd,*dp_port; + void *dp_simd, *dp_port; /* Generate set of coefficients * limit sum of absolute values to 32767 to avoid overflow */ - memset(coeffs,0,sizeof(coeffs)); - for(i=0;i<512;i++){ + memset(coeffs, 0, sizeof(coeffs)); + for (i = 0; i < 512; i++) { double gv; - gv = normal_rand(0.,100.); - if(csum + fabs(gv) > 32767) - break; + gv = normal_rand(0., 100.); + if (csum + fabs(gv) > 32767) { + break; + } coeffs[i] = gv; csum += fabs(gv); } ntaps = i; /* Compare results to portable C version for a bunch of random data buffers and offsets */ - dp_simd = initdp(coeffs,ntaps); - dp_port = initdp_port(coeffs,ntaps); - - for(i=0;i<2048;i++) + dp_simd = initdp(coeffs, ntaps); + dp_port = initdp_port(coeffs, ntaps); + + for (i = 0; i < 2048; i++) { input[i] = random(); - + } + offset = random() & 511; - simd_result = dotprod(dp_simd,input+offset); - port_result = dotprod_port(dp_port,input+offset); - if(simd_result != port_result){ + simd_result = dotprod(dp_simd, input + offset); + port_result = dotprod_port(dp_port, input + offset); + if (simd_result != port_result) { errors++; } } - printf("dtest: %d errors\n",errors); + printf("dtest: %d errors\n", errors); 
exit(0); } diff --git a/libfec/encode_rs.c b/lib/libfec/encode_rs.c similarity index 57% rename from libfec/encode_rs.c rename to lib/libfec/encode_rs.c index 0649094..ccc855b 100644 --- a/libfec/encode_rs.c +++ b/lib/libfec/encode_rs.c @@ -14,9 +14,11 @@ void ENCODE_RS( #ifdef FIXED -data_t *data, data_t *bb,int pad){ + data_t *data, data_t *bb, int pad) +{ #else -void *p,data_t *data, data_t *bb){ + void *p, data_t *data, data_t *bb) +{ struct rs *rs = (struct rs *)p; #endif int i, j; @@ -24,29 +26,33 @@ void *p,data_t *data, data_t *bb){ #ifdef FIXED /* Check pad parameter for validity */ - if(pad < 0 || pad >= NN) + if (pad < 0 || pad >= NN) { return; + } #endif - memset(bb,0,NROOTS*sizeof(data_t)); + memset(bb, 0, NROOTS * sizeof(data_t)); - for(i=0;i +#include +#include "fixed.h" + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. Since the generator polynomial is + * palindromic, we form it by reversing these on the fly + */ +static union { + vector unsigned char v; + unsigned char c[16]; +} table[256]; + +static vector unsigned char reverse = (vector unsigned char)(0, 15, 14, 13, 12, + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); +static vector unsigned char shift_right = (vector unsigned char)(15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30); + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +void rs_init_av() +{ + int i, j; + + /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */ + for (j = 0; j < 16; j++) { + table[0].c[j] = 0; + for (i = 1; i < 256; i++) { + table[i].c[16 - j - 1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j + 1] + + CCSDS_index_of[i])]; + } + } +#if 0 + for (i = 0; i < 256; i++) { + printf("table[%3d] = %3vu\n", i, table[i].v); + } +#endif +} + +void encode_rs_av(unsigned char *data, unsigned char *parity, int pad) +{ + union { + vector unsigned char v[2]; + unsigned char c[32]; 
+ } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for (i = 0; i < NN - NROOTS - pad; i++) { + vector unsigned char feedback0, feedback1; + unsigned char f; + + f = data[i] ^ shift_register.c[31]; + feedback1 = table[f].v; + feedback0 = vec_perm(feedback1, feedback1, reverse); + + /* Shift right one byte */ + shift_register.v[1] = vec_perm(shift_register.v[0], shift_register.v[1], + shift_right) ^ feedback1; + shift_register.v[0] = vec_sro(shift_register.v[0], + (vector unsigned char)(8)) ^ feedback0; + shift_register.c[0] = f; + } + for (i = 0; i < NROOTS; i++) { + parity[NROOTS - i - 1] = shift_register.c[i]; + } +} diff --git a/libfec/encode_rs_ccsds.c b/lib/libfec/encode_rs_ccsds.c similarity index 70% rename from libfec/encode_rs_ccsds.c rename to lib/libfec/encode_rs_ccsds.c index 5a2ec70..91d641a 100644 --- a/libfec/encode_rs_ccsds.c +++ b/lib/libfec/encode_rs_ccsds.c @@ -8,17 +8,20 @@ #include "ccsds.h" #include "fec.h" -void encode_rs_ccsds(data_t *data,data_t *parity,int pad){ +void encode_rs_ccsds(data_t *data, data_t *parity, int pad) +{ int i; - data_t cdata[NN-NROOTS]; + data_t cdata[NN - NROOTS]; /* Convert data from dual basis to conventional */ - for(i=0;i +#include +#include + +#ifdef FIXED +#include "fixed.h" +#define EXERCISE exercise_8 +#elif defined(CCSDS) +#include "fixed.h" +#include "ccsds.h" +#define EXERCISE exercise_ccsds +#elif defined(BIGSYM) +#include "int.h" +#define EXERCISE exercise_int +#else +#include "char.h" +#define EXERCISE exercise_char +#endif + +#ifdef FIXED +#define PRINTPARM printf("(255,223):"); +#elif defined(CCSDS) +#define PRINTPARM printf("CCSDS (255,223):"); +#else +#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots); +#endif + +/* Exercise the RS codec passed as an argument */ +int EXERCISE( +#if !defined(CCSDS) && !defined(FIXED) + void *p, +#endif + int trials) +{ +#if !defined(CCSDS) && !defined(FIXED) + struct rs 
*rs = (struct rs *)p; +#endif + data_t block[NN], tblock[NN]; + int i; + int errors; + int errlocs[NN]; + int derrlocs[NROOTS]; + int derrors; + int errval, errloc; + int erasures; + int decoder_errors = 0; + + while (trials-- != 0) { + /* Test up to the error correction capacity of the code */ + for (errors = 0; errors <= NROOTS / 2; errors++) { + + /* Load block with random data and encode */ + for (i = 0; i < NN - NROOTS; i++) { + block[i] = random() & NN; + } + +#if defined(CCSDS) || defined(FIXED) + ENCODE_RS(&block[0], &block[NN - NROOTS], 0); +#else + ENCODE_RS(rs, &block[0], &block[NN - NROOTS]); +#endif + + /* Make temp copy, seed with errors */ + memcpy(tblock, block, sizeof(tblock)); + memset(errlocs, 0, sizeof(errlocs)); + memset(derrlocs, 0, sizeof(derrlocs)); + erasures = 0; + for (i = 0; i < errors; i++) { + do { + errval = random() & NN; + } + while (errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % NN; + } + while (errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if (random() & 1) { /* 50-50 chance */ + derrlocs[erasures++] = errloc; + } +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ +#if defined(CCSDS) || defined(FIXED) + derrors = DECODE_RS(tblock, derrlocs, erasures, 0); +#else + derrors = DECODE_RS(rs, tblock, derrlocs, erasures); +#endif + + if (derrors != errors) { + PRINTPARM + printf(" decoder says %d errors, true number is %d\n", derrors, errors); + decoder_errors++; + } + for (i = 0; i < derrors; i++) { + if (errlocs[derrlocs[i]] == 0) { + PRINTPARM + printf(" decoder indicates error in location %d without error\n", derrlocs[i]); + decoder_errors++; + } + } + if (memcmp(tblock, block, sizeof(tblock)) != 0) { + PRINTPARM + printf(" uncorrected errors! 
output ^ input:"); + decoder_errors++; + for (i = 0; i < NN; i++) { + printf(" %02x", tblock[i] ^ block[i]); + } + printf("\n"); + } + } + } + return decoder_errors; +} diff --git a/lib/libfec/fec.c b/lib/libfec/fec.c new file mode 100644 index 0000000..45a71b9 --- /dev/null +++ b/lib/libfec/fec.c @@ -0,0 +1,68 @@ +/* Utility routines for FEC support + * Copyright 2004, Phil Karn, KA9Q + */ + +#include +#include "fec.h" + +unsigned char Partab[256]; +int P_init; + +/* Create 256-entry odd-parity lookup table + * Needed only on non-ia32 machines + */ +void partab_init(void) +{ + int i, cnt, ti; + + /* Initialize parity lookup table */ + for (i = 0; i < 256; i++) { + cnt = 0; + ti = i; + while (ti) { + if (ti & 1) { + cnt++; + } + ti >>= 1; + } + Partab[i] = cnt & 1; + } + P_init = 1; +} + +/* Lookup table giving count of 1 bits for integers 0-255 */ +int Bitcnt[] = { + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, +}; + diff --git a/libfec/fixed.h b/lib/libfec/fixed.h similarity index 95% rename from libfec/fixed.h rename to lib/libfec/fixed.h index 0ff27b2..f503283 100644 --- a/libfec/fixed.h +++ b/lib/libfec/fixed.h @@ -7,7 +7,8 @@ */ typedef unsigned char data_t; -static inline int mod255(int 
x){ +static inline int mod255(int x) +{ while (x >= 255) { x -= 255; x = (x >> 8) + (x & 255); diff --git a/libfec/gen_ccsds.c b/lib/libfec/gen_ccsds.c similarity index 59% rename from libfec/gen_ccsds.c rename to lib/libfec/gen_ccsds.c index e1e2e26..70568db 100644 --- a/libfec/gen_ccsds.c +++ b/lib/libfec/gen_ccsds.c @@ -9,30 +9,34 @@ #include "rs-common.h" #include "fec.h" -int main(){ +int main() +{ struct rs *rs; int i; - rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */ + rs = init_rs_char(8, 0x187, 112, 11, 32, 0); /* CCSDS standard */ assert(rs != NULL); printf("char CCSDS_alpha_to[] = {"); - for(i=0;i<256;i++){ - if((i % 16) == 0) + for (i = 0; i < 256; i++) { + if ((i % 16) == 0) { printf("\n"); - printf("0x%02x,",rs->alpha_to[i]); + } + printf("0x%02x,", rs->alpha_to[i]); } printf("\n};\n\nchar CCSDS_index_of[] = {"); - for(i=0;i<256;i++){ - if((i % 16) == 0) + for (i = 0; i < 256; i++) { + if ((i % 16) == 0) { printf("\n"); - printf("%3d,",rs->index_of[i]); + } + printf("%3d,", rs->index_of[i]); } printf("\n};\n\nchar CCSDS_poly[] = {"); - for(i=0;i<33;i++){ - if((i % 16) == 0) + for (i = 0; i < 33; i++) { + if ((i % 16) == 0) { printf("\n"); + } - printf("%3d,",rs->genpoly[i]); + printf("%3d,", rs->genpoly[i]); } printf("\n};\n"); exit(0); diff --git a/libfec/gen_ccsds_tal.c b/lib/libfec/gen_ccsds_tal.c similarity index 67% rename from libfec/gen_ccsds_tal.c rename to lib/libfec/gen_ccsds_tal.c index fc75503..9d86bbc 100644 --- a/libfec/gen_ccsds_tal.c +++ b/lib/libfec/gen_ccsds_tal.c @@ -14,7 +14,7 @@ #include #define DTYPE unsigned char -DTYPE Taltab[256],Tal1tab[256]; +DTYPE Taltab[256], Tal1tab[256]; static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b }; @@ -23,29 +23,33 @@ static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b }; * and Berlekamp's dual basis representation * (l0, l1, ...l7) */ -int main(){ - int i,j,k; +int main() +{ + int i, j, k; - for(i=0;i<256;i++){/* For each value of input */ + for 
(i = 0; i < 256; i++) { /* For each value of input */ Taltab[i] = 0; - for(j=0;j<8;j++) /* for each column of matrix */ - for(k=0;k<8;k++){ /* for each row of matrix */ - if(i & (1<alpha_to); @@ -29,8 +30,9 @@ void free_rs(void *p){ * nroots = RS code generator polynomial degree (number of roots) * pad = padding bytes at front of shortened block */ -void *init_rs_common(int symsize,int gfpoly,int fcr,int prim, - int nroots,int pad){ +void *init_rs_common(int symsize, int gfpoly, int fcr, int prim, + int nroots, int pad) +{ struct rs *rs; #include "init_rs.h" diff --git a/libfec/init_rs.h b/lib/libfec/init_rs.h similarity index 51% rename from libfec/init_rs.h rename to lib/libfec/init_rs.h index 2b2ae98..abbca80 100644 --- a/libfec/init_rs.h +++ b/lib/libfec/init_rs.h @@ -6,39 +6,52 @@ #define NULL ((void *)0) { - int i, j, sr,root,iprim; + int i, j, sr, root, iprim; rs = NULL; /* Check parameter ranges */ - if(symsize < 0 || symsize > 8*sizeof(data_t)){ + if (symsize < 0 || symsize > 8 * sizeof(data_t)) + { goto done; } - if(fcr < 0 || fcr >= (1<= (1 << symsize)) + { goto done; - if(prim <= 0 || prim >= (1<= (1 << symsize)) + { goto done; - if(nroots < 0 || nroots >= (1<= ((1<= (1 << symsize)) + { + goto done; /* Can't have more roots than symbol values! 
*/ + } + if (pad < 0 || pad >= ((1 << symsize) - 1 - nroots)) + { + goto done; /* Too much padding */ + } - rs = (struct rs *)calloc(1,sizeof(struct rs)); - if(rs == NULL) + rs = (struct rs *)calloc(1, sizeof(struct rs)); + if (rs == NULL) + { goto done; + } rs->mm = symsize; - rs->nn = (1<nn = (1 << symsize) - 1; rs->pad = pad; - rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); - if(rs->alpha_to == NULL){ + rs->alpha_to = (data_t *)malloc(sizeof(data_t) * (rs->nn + 1)); + if (rs->alpha_to == NULL) + { free(rs); rs = NULL; goto done; } - rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); - if(rs->index_of == NULL){ + rs->index_of = (data_t *)malloc(sizeof(data_t) * (rs->nn + 1)); + if (rs->index_of == NULL) + { free(rs->alpha_to); free(rs); rs = NULL; @@ -49,15 +62,18 @@ rs->index_of[0] = A0; /* log(zero) = -inf */ rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */ sr = 1; - for(i=0;inn;i++){ + for (i = 0; i < rs->nn; i++) + { rs->index_of[sr] = i; rs->alpha_to[i] = sr; sr <<= 1; - if(sr & (1<nn; } - if(sr != 1){ + if (sr != 1) + { /* field generator polynomial is not primitive! 
*/ free(rs->alpha_to); free(rs->index_of); @@ -67,8 +83,9 @@ } /* Form RS code generator polynomial from its roots */ - rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1)); - if(rs->genpoly == NULL){ + rs->genpoly = (data_t *)malloc(sizeof(data_t) * (nroots + 1)); + if (rs->genpoly == NULL) + { free(rs->alpha_to); free(rs->index_of); free(rs); @@ -80,27 +97,34 @@ rs->nroots = nroots; /* Find prim-th root of 1, used in decoding */ - for(iprim=1;(iprim % prim) != 0;iprim += rs->nn) + for (iprim = 1; (iprim % prim) != 0; iprim += rs->nn) ; rs->iprim = iprim / prim; rs->genpoly[0] = 1; - for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) { - rs->genpoly[i+1] = 1; + for (i = 0, root = fcr *prim; i < nroots; i++, root += prim) + { + rs->genpoly[i + 1] = 1; /* Multiply rs->genpoly[] by @**(root + x) */ - for (j = i; j > 0; j--){ - if (rs->genpoly[j] != 0) - rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)]; - else - rs->genpoly[j] = rs->genpoly[j-1]; + for (j = i; j > 0; j--) { + if (rs->genpoly[j] != 0) { + rs->genpoly[j] = rs->genpoly[j - 1] ^ rs->alpha_to[modnn(rs, + rs->index_of[rs->genpoly[j]] + root)]; + } + else { + rs->genpoly[j] = rs->genpoly[j - 1]; + } } /* rs->genpoly[0] can never be zero */ - rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)]; + rs->genpoly[0] = rs->alpha_to[modnn(rs, rs->index_of[rs->genpoly[0]] + root)]; } /* convert rs->genpoly[] to index form for quicker encoding */ for (i = 0; i <= nroots; i++) + { rs->genpoly[i] = rs->index_of[rs->genpoly[i]]; - done:; + } +done: + ; } diff --git a/libfec/init_rs_char.c b/lib/libfec/init_rs_char.c similarity index 84% rename from libfec/init_rs_char.c rename to lib/libfec/init_rs_char.c index a51099a..48a9532 100644 --- a/libfec/init_rs_char.c +++ b/lib/libfec/init_rs_char.c @@ -8,7 +8,8 @@ #include "char.h" #include "rs-common.h" -void free_rs_char(void *p){ +void free_rs_char(void *p) +{ struct rs *rs = (struct rs *)p; 
free(rs->alpha_to); @@ -25,8 +26,9 @@ void free_rs_char(void *p){ * nroots = RS code generator polynomial degree (number of roots) * pad = padding bytes at front of shortened block */ -void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, - int nroots,int pad){ +void *init_rs_char(int symsize, int gfpoly, int fcr, int prim, + int nroots, int pad) +{ struct rs *rs; #include "init_rs.h" diff --git a/libfec/init_rs_int.c b/lib/libfec/init_rs_int.c similarity index 84% rename from libfec/init_rs_int.c rename to lib/libfec/init_rs_int.c index a6036c2..6abb5cf 100644 --- a/libfec/init_rs_int.c +++ b/lib/libfec/init_rs_int.c @@ -8,7 +8,8 @@ #include "int.h" #include "rs-common.h" -void free_rs_int(void *p){ +void free_rs_int(void *p) +{ struct rs *rs = (struct rs *)p; free(rs->alpha_to); @@ -25,8 +26,9 @@ void free_rs_int(void *p){ * nroots = RS code generator polynomial degree (number of roots) * pad = padding bytes at front of shortened block */ -void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, - int nroots,int pad){ +void *init_rs_int(int symsize, int gfpoly, int fcr, int prim, + int nroots, int pad) +{ struct rs *rs; #include "init_rs.h" diff --git a/libfec/install-sh b/lib/libfec/install-sh similarity index 100% rename from libfec/install-sh rename to lib/libfec/install-sh diff --git a/libfec/int.h b/lib/libfec/int.h similarity index 93% rename from libfec/int.h rename to lib/libfec/int.h index 46e865d..50d8fe3 100644 --- a/libfec/int.h +++ b/lib/libfec/int.h @@ -9,7 +9,7 @@ typedef unsigned int data_t; #define MM (rs->mm) #define NN (rs->nn) -#define ALPHA_TO (rs->alpha_to) +#define ALPHA_TO (rs->alpha_to) #define INDEX_OF (rs->index_of) #define GENPOLY (rs->genpoly) #define NROOTS (rs->nroots) diff --git a/libfec/lesser.txt b/lib/libfec/lesser.txt similarity index 100% rename from libfec/lesser.txt rename to lib/libfec/lesser.txt diff --git a/libfec/libfec.pc.in b/lib/libfec/libfec.pc.in similarity index 100% rename from libfec/libfec.pc.in rename 
to lib/libfec/libfec.pc.in diff --git a/libfec/makefile.in b/lib/libfec/makefile.in similarity index 100% rename from libfec/makefile.in rename to lib/libfec/makefile.in diff --git a/libfec/mmxbfly27.s b/lib/libfec/mmxbfly27.s similarity index 100% rename from libfec/mmxbfly27.s rename to lib/libfec/mmxbfly27.s diff --git a/libfec/mmxbfly29.s b/lib/libfec/mmxbfly29.s similarity index 100% rename from libfec/mmxbfly29.s rename to lib/libfec/mmxbfly29.s diff --git a/libfec/peak_mmx_assist.s b/lib/libfec/peak_mmx_assist.s similarity index 100% rename from libfec/peak_mmx_assist.s rename to lib/libfec/peak_mmx_assist.s diff --git a/libfec/peak_sse2_assist.s b/lib/libfec/peak_sse2_assist.s similarity index 100% rename from libfec/peak_sse2_assist.s rename to lib/libfec/peak_sse2_assist.s diff --git a/libfec/peak_sse_assist.s b/lib/libfec/peak_sse_assist.s similarity index 100% rename from libfec/peak_sse_assist.s rename to lib/libfec/peak_sse_assist.s diff --git a/libfec/peaktest.c b/lib/libfec/peaktest.c similarity index 52% rename from libfec/peaktest.c rename to lib/libfec/peaktest.c index fa4b280..742dc29 100644 --- a/libfec/peaktest.c +++ b/lib/libfec/peaktest.c @@ -9,30 +9,33 @@ #define NSAMP 200002 #define OFFSET 1 -int peakval(signed short *,int); -int peakval_port(signed short *,int); +int peakval(signed short *, int); +int peakval_port(signed short *, int); -int main(){ - int i,s; - int result,rresult; +int main() +{ + int i, s; + int result, rresult; signed short samples[NSAMP]; srandom(time(NULL)); - for(i=0;i +#include "fec.h" + +int peakval_port(signed short *b, int cnt); +#ifdef __i386__ +int peakval_mmx(signed short *b, int cnt); +int peakval_sse(signed short *b, int cnt); +int peakval_sse2(signed short *b, int cnt); +#endif + +#ifdef __x86_64__ +int peakval_sse2(signed short *b, int cnt); +#endif + +#ifdef __VEC__ +int peakval_av(signed short *b, int cnt); +#endif + +int peakval(signed short *b, int cnt) +{ + find_cpu_mode(); + + switch (Cpu_mode) { + 
case PORT: + default: + return peakval_port(b, cnt); +#ifdef __i386__ + case MMX: + return peakval_mmx(b, cnt); + case SSE: + return peakval_sse(b, cnt); + case SSE2: + return peakval_sse2(b, cnt); +#endif + +#ifdef __x86_64__ + case SSE2: + return peakval_port(b, cnt); + //return peakval_sse2(b,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return peakval_av(b, cnt); +#endif + } +} diff --git a/lib/libfec/peakval_av.c b/lib/libfec/peakval_av.c new file mode 100644 index 0000000..3779f16 --- /dev/null +++ b/lib/libfec/peakval_av.c @@ -0,0 +1,67 @@ +/* Return the largest absolute value of a vector of signed shorts + + * This is the Altivec SIMD version. + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +signed short peakval_av(signed short *in, int cnt) +{ + vector signed short x; + int pad; + union { + vector signed char cv; + vector signed short hv; + signed short s[8]; + signed char c[16]; + } s; + vector signed short smallest, largest; + + smallest = (vector signed short)(0); + largest = (vector signed short)(0); + if ((pad = (int)in & 15) != 0) { + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0, in), (vector signed short)(0), vec_lvsl(0, in)); + if (cnt < 8) { /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8 - cnt) << 4; + x = vec_sro(x, s.cv); + } + smallest = vec_min(smallest, x); + largest = vec_max(largest, x); + in += 8 - pad / 2; + cnt -= 8 - pad / 2; + } + /* Everything is now aligned, rip through most of the block */ + while (cnt >= 8) { + x = vec_ld(0, in); + smallest = vec_min(smallest, x); + largest = vec_max(largest, x); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if (cnt > 0) { + x = vec_ld(0, in); + s.c[15] = (8 - cnt) << 4; + x = vec_sro(x, s.cv); + smallest = vec_min(smallest, x); + largest = vec_max(largest, x); + } + /* Combine and extract result */ + largest = vec_max(largest, 
vec_abs(smallest)); + + s.c[15] = 64; /* Shift right four 16-bit words */ + largest = vec_max(largest, vec_sro(largest, s.cv)); + + s.c[15] = 32; /* Shift right two 16-bit words */ + largest = vec_max(largest, vec_sro(largest, s.cv)); + + s.c[15] = 16; /* Shift right one 16-bit word */ + largest = vec_max(largest, vec_sro(largest, s.cv)); + + s.hv = largest; + return s.s[7]; +} diff --git a/libfec/peakval_mmx.c b/lib/libfec/peakval_mmx.c similarity index 53% rename from libfec/peakval_mmx.c rename to lib/libfec/peakval_mmx.c index 436fe88..c8a4a62 100644 --- a/libfec/peakval_mmx.c +++ b/lib/libfec/peakval_mmx.c @@ -4,31 +4,35 @@ #include -int peakval_mmx_assist(signed short *,int); +int peakval_mmx_assist(signed short *, int); -int peakval_mmx(signed short *b,int cnt){ +int peakval_mmx(signed short *b, int cnt) +{ int peak = 0; int a; - while(((int)b & 7) != 0 && cnt != 0){ + while (((int)b & 7) != 0 && cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } - a = peakval_mmx_assist(b,cnt); - if(a > peak) + a = peakval_mmx_assist(b, cnt); + if (a > peak) { peak = a; + } b += cnt & ~3; cnt &= 3; - while(cnt != 0){ + while (cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } return peak; -} +} diff --git a/libfec/peakval_mmx_assist.s b/lib/libfec/peakval_mmx_assist.s similarity index 100% rename from libfec/peakval_mmx_assist.s rename to lib/libfec/peakval_mmx_assist.s diff --git a/libfec/peakval_port.c b/lib/libfec/peakval_port.c similarity index 61% rename from libfec/peakval_port.c rename to lib/libfec/peakval_port.c index 07ab316..c410eeb 100644 --- a/libfec/peakval_port.c +++ b/lib/libfec/peakval_port.c @@ -3,14 +3,16 @@ */ #include #include "fec.h" -int peakval_port(signed short *b,int len){ +int peakval_port(signed short *b, int len) +{ int peak = 0; - int a,i; + int a, i; - for(i=0;i peak) + if (a > peak) { peak = a; + } } return peak; } diff --git a/libfec/peakval_sse.c 
b/lib/libfec/peakval_sse.c similarity index 54% rename from libfec/peakval_sse.c rename to lib/libfec/peakval_sse.c index 9868b7f..ba6c525 100644 --- a/libfec/peakval_sse.c +++ b/lib/libfec/peakval_sse.c @@ -5,31 +5,35 @@ #include #include "fec.h" -int peakval_sse_assist(signed short *,int); +int peakval_sse_assist(signed short *, int); -int peakval_sse(signed short *b,int cnt){ +int peakval_sse(signed short *b, int cnt) +{ int peak = 0; int a; - while(((int)b & 7) != 0 && cnt != 0){ + while (((int)b & 7) != 0 && cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } - a = peakval_sse_assist(b,cnt); - if(a > peak) + a = peakval_sse_assist(b, cnt); + if (a > peak) { peak = a; + } b += cnt & ~3; cnt &= 3; - while(cnt != 0){ + while (cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } return peak; -} +} diff --git a/libfec/peakval_sse2.c b/lib/libfec/peakval_sse2.c similarity index 53% rename from libfec/peakval_sse2.c rename to lib/libfec/peakval_sse2.c index 79d9059..b29d161 100644 --- a/libfec/peakval_sse2.c +++ b/lib/libfec/peakval_sse2.c @@ -4,31 +4,35 @@ #include #include "fec.h" -int peakval_sse2_assist(signed short *,int); +int peakval_sse2_assist(signed short *, int); -int peakval_sse2(signed short *b,int cnt){ +int peakval_sse2(signed short *b, int cnt) +{ int peak = 0; int a; - while(((int)b & 15) != 0 && cnt != 0){ + while (((int)b & 15) != 0 && cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } - a = peakval_sse2_assist(b,cnt); - if(a > peak) + a = peakval_sse2_assist(b, cnt); + if (a > peak) { peak = a; + } b += cnt & ~7; cnt &= 7; - while(cnt != 0){ + while (cnt != 0) { a = abs(*b); - if(a > peak) + if (a > peak) { peak = a; + } b++; cnt--; } return peak; -} +} diff --git a/libfec/peakval_sse2_assist.s b/lib/libfec/peakval_sse2_assist.s similarity index 100% rename from libfec/peakval_sse2_assist.s rename to lib/libfec/peakval_sse2_assist.s diff --git 
a/libfec/peakval_sse_assist.s b/lib/libfec/peakval_sse_assist.s similarity index 100% rename from libfec/peakval_sse_assist.s rename to lib/libfec/peakval_sse_assist.s diff --git a/libfec/rs-common.h b/lib/libfec/rs-common.h similarity index 94% rename from libfec/rs-common.h rename to lib/libfec/rs-common.h index e64eb39..de02d46 100644 --- a/libfec/rs-common.h +++ b/lib/libfec/rs-common.h @@ -17,7 +17,8 @@ struct rs { int pad; /* Padding bytes in shortened block */ }; -static inline int modnn(struct rs *rs,int x){ +static inline int modnn(struct rs *rs, int x) +{ while (x >= rs->nn) { x -= rs->nn; x = (x >> rs->mm) + (x & rs->nn); diff --git a/libfec/rs.3 b/lib/libfec/rs.3 similarity index 100% rename from libfec/rs.3 rename to lib/libfec/rs.3 diff --git a/lib/libfec/rs_speedtest.c b/lib/libfec/rs_speedtest.c new file mode 100644 index 0000000..1b0c072 --- /dev/null +++ b/lib/libfec/rs_speedtest.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include +#include "fec.h" + +int main() +{ + unsigned char block[255]; + int i; + void *rs; + struct rusage start, finish; + double extime; + int trials = 10000; + + for (i = 0; i < 223; i++) { + block[i] = 0x01; + } + + rs = init_rs_char(8, 0x187, 112, 11, 32, 0); + encode_rs_char(rs, block, &block[223]); + + getrusage(RUSAGE_SELF, &start); + for (i = 0; i < trials; i++) { +#if 0 + block[0] ^= 0xff; /* Introduce an error */ + block[2] ^= 0xff; /* Introduce an error */ +#endif + decode_rs_char(rs, block, NULL, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + + printf("Execution time for %d Reed-Solomon blocks using general decoder: %.2f sec\n", + trials, extime); + printf("decoder speed: %g bits/s\n", trials * 223 * 8 / extime); + + + encode_rs_8(block, &block[223], 0); + getrusage(RUSAGE_SELF, &start); + for (i = 0; i < trials; i++) { +#if 0 + block[0] ^= 0xff; /* Introduce an error */ 
+ block[2] ^= 0xff; /* Introduce an error */ +#endif + decode_rs_8(block, NULL, 0, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d Reed-Solomon blocks using CCSDS decoder: %.2f sec\n", + trials, extime); + printf("decoder speed: %g bits/s\n", trials * 223 * 8 / extime); + + exit(0); +} + diff --git a/lib/libfec/rstest.c b/lib/libfec/rstest.c new file mode 100644 index 0000000..a62c73d --- /dev/null +++ b/lib/libfec/rstest.c @@ -0,0 +1,324 @@ +/* Test the Reed-Solomon codecs + * for various block sizes and with random data and random error patterns + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include +#include +#include +#include +#include "fec.h" + + +struct etab { + int symsize; + int genpoly; + int fcs; + int prim; + int nroots; + int ntrials; +} Tab[] = { + {2, 0x7, 1, 1, 1, 10 }, + {3, 0xb, 1, 1, 2, 10 }, + {4, 0x13, 1, 1, 4, 10 }, + {5, 0x25, 1, 1, 6, 10 }, + {6, 0x43, 1, 1, 8, 10 }, + {7, 0x89, 1, 1, 10, 10 }, + {8, 0x11d, 1, 1, 32, 10 }, + {8, 0x187, 112, 11, 32, 10 }, /* Duplicates CCSDS codec */ + {9, 0x211, 1, 1, 32, 10 }, + {10, 0x409, 1, 1, 32, 10 }, + {11, 0x805, 1, 1, 32, 10 }, + {12, 0x1053, 1, 1, 32, 5 }, + {13, 0x201b, 1, 1, 32, 2 }, + {14, 0x4443, 1, 1, 32, 1 }, + {15, 0x8003, 1, 1, 32, 1 }, + {16, 0x1100b, 1, 1, 32, 1 }, + {0, 0, 0, 0, 0}, +}; + +int exercise_char(struct etab *e); +int exercise_int(struct etab *e); +int exercise_8(void); + +int main() +{ + int i; + + srandom(time(NULL)); + + printf("Testing fixed CCSDS encoder...\n"); + exercise_8(); + for (i = 0; Tab[i].symsize != 0; i++) { + int nn, kk; + + nn = (1 << Tab[i].symsize) - 1; + kk = nn - Tab[i].nroots; + printf("Testing (%d,%d) code...\n", nn, kk); + if (Tab[i].symsize <= 8) { + exercise_char(&Tab[i]); + } + else { + exercise_int(&Tab[i]); + } + } + 
exit(0); +} + +int exercise_8(void) +{ + int nn = 255; + unsigned char block[nn], tblock[nn]; + int errlocs[nn], derrlocs[nn]; + int i; + int errors; + int derrors, kk; + int errval, errloc; + int erasures; + int decoder_errors = 0; + + /* Compute code parameters */ + kk = 223; + + + /* Test up to the error correction capacity of the code */ + for (errors = 0; errors <= (nn - kk) / 2; errors++) { + + /* Load block with random data and encode */ + for (i = 0; i < kk; i++) { + block[i] = random() & nn; + } + memcpy(tblock, block, sizeof(block)); + encode_rs_8(block, &block[kk], 0); + + /* Make temp copy, seed with errors */ + memcpy(tblock, block, sizeof(block)); + memset(errlocs, 0, sizeof(errlocs)); + memset(derrlocs, 0, sizeof(derrlocs)); + erasures = 0; + for (i = 0; i < errors; i++) { + do { + errval = random() & nn; + } + while (errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % nn; + } + while (errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if (random() & 1) { /* 50-50 chance */ + derrlocs[erasures++] = errloc; + } +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_8(tblock, derrlocs, erasures, 0); + + if (derrors != errors) { + printf("(%d,%d) decoder says %d errors, true number is %d\n", nn, kk, derrors, + errors); + decoder_errors++; + } + for (i = 0; i < derrors; i++) { + if (errlocs[derrlocs[i]] == 0) { + printf("(%d,%d) decoder indicates error in location %d without error\n", nn, kk, + derrlocs[i]); + decoder_errors++; + } + } + if (memcmp(tblock, block, sizeof(tblock)) != 0) { + printf("(%d,%d) decoder uncorrected errors! 
output ^ input:", nn, kk); + decoder_errors++; + for (i = 0; i < nn; i++) { + printf(" %02x", tblock[i] ^ block[i]); + } + printf("\n"); + } + } + return decoder_errors; +} + + +int exercise_char(struct etab *e) +{ + int nn = (1 << e->symsize) - 1; + unsigned char block[nn], tblock[nn]; + int errlocs[nn], derrlocs[nn]; + int i; + int errors; + int derrors, kk; + int errval, errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + if (e->symsize > 8) { + return -1; + } + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_char(e->symsize, e->genpoly, e->fcs, e->prim, e->nroots, 0); + if (rs == NULL) { + printf("init_rs_char failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for (errors = 0; errors <= e->nroots / 2; errors++) { + + /* Load block with random data and encode */ + for (i = 0; i < kk; i++) { + block[i] = random() & nn; + } + memcpy(tblock, block, sizeof(block)); + encode_rs_char(rs, block, &block[kk]); + + /* Make temp copy, seed with errors */ + memcpy(tblock, block, sizeof(block)); + memset(errlocs, 0, sizeof(errlocs)); + memset(derrlocs, 0, sizeof(derrlocs)); + erasures = 0; + for (i = 0; i < errors; i++) { + do { + errval = random() & nn; + } + while (errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % nn; + } + while (errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if (random() & 1) { /* 50-50 chance */ + derrlocs[erasures++] = errloc; + } +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_char(rs, tblock, derrlocs, erasures); + + if (derrors != errors) { + printf("(%d,%d) decoder says %d errors, true number is %d\n", nn, kk, derrors, + errors); + decoder_errors++; + } + for (i = 0; i < derrors; i++) { + if (errlocs[derrlocs[i]] == 0) { + printf("(%d,%d) decoder indicates error in location %d without error\n", nn, kk, + derrlocs[i]); + 
decoder_errors++; + } + } + if (memcmp(tblock, block, sizeof(tblock)) != 0) { + printf("(%d,%d) decoder uncorrected errors! output ^ input:", nn, kk); + decoder_errors++; + for (i = 0; i < nn; i++) { + printf(" %02x", tblock[i] ^ block[i]); + } + printf("\n"); + } + } + + free_rs_char(rs); + return 0; +} + +int exercise_int(struct etab *e) +{ + int nn = (1 << e->symsize) - 1; + int block[nn], tblock[nn]; + int errlocs[nn], derrlocs[nn]; + int i; + int errors; + int derrors, kk; + int errval, errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_int(e->symsize, e->genpoly, e->fcs, e->prim, e->nroots, 0); + if (rs == NULL) { + printf("init_rs_int failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for (errors = 0; errors <= e->nroots / 2; errors++) { + + /* Load block with random data and encode */ + for (i = 0; i < kk; i++) { + block[i] = random() & nn; + } + memcpy(tblock, block, sizeof(block)); + encode_rs_int(rs, block, &block[kk]); + + /* Make temp copy, seed with errors */ + memcpy(tblock, block, sizeof(block)); + memset(errlocs, 0, sizeof(errlocs)); + memset(derrlocs, 0, sizeof(derrlocs)); + erasures = 0; + for (i = 0; i < errors; i++) { + do { + errval = random() & nn; + } + while (errval == 0); /* Error value must be nonzero */ + + do { + errloc = random() % nn; + } + while (errlocs[errloc] != 0); /* Must not choose the same location twice */ + + errlocs[errloc] = 1; + +#if FLAG_ERASURE + if (random() & 1) { /* 50-50 chance */ + derrlocs[erasures++] = errloc; + } +#endif + tblock[errloc] ^= errval; + } + + /* Decode the errored block */ + derrors = decode_rs_int(rs, tblock, derrlocs, erasures); + + if (derrors != errors) { + printf("(%d,%d) decoder says %d errors, true number is %d\n", nn, kk, derrors, + errors); + decoder_errors++; + } + for (i = 0; i < derrors; i++) { + if (errlocs[derrlocs[i]] == 0) { + printf("(%d,%d) decoder 
indicates error in location %d without error\n", nn, kk, + derrlocs[i]); + decoder_errors++; + } + } + if (memcmp(tblock, block, sizeof(tblock)) != 0) { + printf("(%d,%d) decoder uncorrected errors! output ^ input:", nn, kk); + decoder_errors++; + for (i = 0; i < nn; i++) { + printf(" %02x", tblock[i] ^ block[i]); + } + printf("\n"); + } + } + + free_rs_int(rs); + return 0; +} diff --git a/libfec/sim.c b/lib/libfec/sim.c similarity index 54% rename from libfec/sim.c rename to lib/libfec/sim.c index 151b04c..8f25459 100644 --- a/libfec/sim.c +++ b/lib/libfec/sim.c @@ -2,19 +2,19 @@ #include #include "fec.h" -#define MAX_RANDOM 0x7fffffff +#define MAX_RANDOM 0x7fffffff /* Generate gaussian random double with specified mean and std_dev */ double normal_rand(double mean, double std_dev) { - double fac,rsq,v1,v2; + double fac, rsq, v1, v2; static double gset; static int iset; - if(iset){ + if (iset) { /* Already got one */ iset = 0; - return mean + std_dev*gset; + return mean + std_dev * gset; } /* Generate two evenly distributed numbers between -1 and +1 * that are inside the unit circle @@ -22,22 +22,27 @@ double normal_rand(double mean, double std_dev) do { v1 = 2.0 * (double)random() / MAX_RANDOM - 1; v2 = 2.0 * (double)random() / MAX_RANDOM - 1; - rsq = v1*v1 + v2*v2; - } while(rsq >= 1.0 || rsq == 0.0); - fac = sqrt(-2.0*log(rsq)/rsq); - gset = v1*fac; + rsq = v1 * v1 + v2 * v2; + } + while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; iset++; - return mean + std_dev*v2*fac; + return mean + std_dev * v2 * fac; } -unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){ +unsigned char addnoise(int sym, double amp, double gain, double offset, + int clip) +{ int sample; - - sample = offset + gain*normal_rand(sym?amp:-amp,1.0); + + sample = offset + gain * normal_rand(sym ? 
amp : -amp, 1.0); /* Clip to 8-bit offset range */ - if(sample < 0) + if (sample < 0) { sample = 0; - else if(sample > clip) + } + else if (sample > clip) { sample = clip; + } return sample; } diff --git a/libfec/simd-viterbi.3 b/lib/libfec/simd-viterbi.3 similarity index 100% rename from libfec/simd-viterbi.3 rename to lib/libfec/simd-viterbi.3 diff --git a/lib/libfec/sqtest.c b/lib/libfec/sqtest.c new file mode 100644 index 0000000..c990bd8 --- /dev/null +++ b/lib/libfec/sqtest.c @@ -0,0 +1,46 @@ +/* Verify correctness of the sum-of-square routines */ +#include +#include +#include + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +long long sumsq_wq(signed short *in, int cnt); +long long sumsq_wq_ref(signed short *in, int cnt); + +int main() +{ + int i; + long long result, rresult; + signed short samples[NSAMP]; + + srandom(time(NULL)); + + for (i = 0; i < NSAMP; i++) { + samples[i] = random() & 0xffff; + } + + rresult = sumsq_wq(&samples[OFFSET], NSAMP - OFFSET); + result = sumsq_wq(&samples[OFFSET], NSAMP - OFFSET); + if (result == rresult) { + printf("OK\n"); + } + else { + printf("sum mismatch: %lld != %lld\n", result, rresult); + } + exit(0); +} + +long long sumsq_wq_ref(signed short *in, int cnt) +{ + long long sum = 0; + int i; + + for (i = 0; i < cnt; i++) { + sum += (long)in[i] * in[i]; + } + return sum; +} + diff --git a/libfec/sse2bfly27-64.s b/lib/libfec/sse2bfly27-64.s similarity index 100% rename from libfec/sse2bfly27-64.s rename to lib/libfec/sse2bfly27-64.s diff --git a/libfec/sse2bfly27.s b/lib/libfec/sse2bfly27.s similarity index 100% rename from libfec/sse2bfly27.s rename to lib/libfec/sse2bfly27.s diff --git a/libfec/sse2bfly29-64.s b/lib/libfec/sse2bfly29-64.s similarity index 100% rename from libfec/sse2bfly29-64.s rename to lib/libfec/sse2bfly29-64.s diff --git a/libfec/sse2bfly29.s b/lib/libfec/sse2bfly29.s similarity index 100% rename from libfec/sse2bfly29.s rename to 
lib/libfec/sse2bfly29.s diff --git a/libfec/ssebfly27.s b/lib/libfec/ssebfly27.s similarity index 100% rename from libfec/ssebfly27.s rename to lib/libfec/ssebfly27.s diff --git a/libfec/ssebfly29.s b/lib/libfec/ssebfly29.s similarity index 100% rename from libfec/ssebfly29.s rename to lib/libfec/ssebfly29.s diff --git a/lib/libfec/sumsq.c b/lib/libfec/sumsq.c new file mode 100644 index 0000000..c411d2a --- /dev/null +++ b/lib/libfec/sumsq.c @@ -0,0 +1,51 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include +#include "fec.h" + +unsigned long long sumsq_port(signed short *, int); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *, int); +unsigned long long sumsq_sse(signed short *, int); +unsigned long long sumsq_sse2(signed short *, int); +#endif + +#ifdef __x86_64__ +unsigned long long sumsq_sse2(signed short *, int); +#endif + +#ifdef __VEC__ +unsigned long long sumsq_av(signed short *, int); +#endif + +unsigned long long sumsq(signed short *in, int cnt) +{ + switch (Cpu_mode) { + case PORT: + default: + return sumsq_port(in, cnt); +#ifdef __i386__ + case SSE: + case MMX: + return sumsq_mmx(in, cnt); + case SSE2: + return sumsq_sse2(in, cnt); +#endif + +#ifdef __x86_64__ + case SSE2: + return sumsq_port(in, cnt); + //return sumsq_sse2(in,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return sumsq_av(in, cnt); +#endif + } +} diff --git a/lib/libfec/sumsq_av.c b/lib/libfec/sumsq_av.c new file mode 100644 index 0000000..bf3bba0 --- /dev/null +++ b/lib/libfec/sumsq_av.c @@ -0,0 +1,84 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * This is the Altivec SIMD version. 
It's a little hairy because Altivec + * does not do 64-bit operations directly, so we have to accumulate separate + * 32-bit sums and carries + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +unsigned long long sumsq_av(signed short *in, int cnt) +{ + long long sum; + vector signed short x; + vector unsigned int sums, carries, s1, s2; + int pad; + union { + vector unsigned char cv; + vector unsigned int iv; + unsigned int w[4]; + unsigned char c[16]; + } s; + + carries = sums = (vector unsigned int)(0); + if ((pad = (int)in & 15) != 0) { + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0, in), (vector signed short)(0), vec_lvsl(0, in)); + if (cnt < 8) { /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8 - cnt) << 4; + x = vec_sro(x, s.cv); + } + sums = (vector unsigned int)vec_msum(x, x, (vector signed int)(0)); + in += 8 - pad / 2; + cnt -= 8 - pad / 2; + } + /* Everything is now aligned, rip through most of the block */ + while (cnt >= 8) { + x = vec_ld(0, in); + /* A single vec_msum cannot overflow, but we have to sum it with + * the earlier terms separately to handle the carries + * The cast to unsigned is OK because squares are always positive + */ + s1 = (vector unsigned int)vec_msum(x, x, (vector signed int)(0)); + carries = vec_add(carries, vec_addc(sums, s1)); + sums = vec_add(sums, s1); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if (cnt > 0) { + x = vec_ld(0, in); + s.c[15] = (8 - cnt) << 4; + x = vec_sro(x, s.cv); + s1 = (vector unsigned int)vec_msum(x, x, (vector signed int)(0)); + carries = vec_add(carries, vec_addc(sums, s1)); + sums = vec_add(sums, s1); + } + /* Combine 4 sub-sums and carries */ + s.c[15] = 64; /* Shift right two 32-bit words */ + s1 = vec_sro(sums, s.cv); + s2 = vec_sro(carries, s.cv); + carries = vec_add(carries, vec_addc(sums, s1)); + sums = vec_add(sums, s1); + carries = 
vec_add(carries, s2); + + s.c[15] = 32; /* Shift right one 32-bit word */ + s1 = vec_sro(sums, s.cv); + s2 = vec_sro(carries, s.cv); + carries = vec_add(carries, vec_addc(sums, s1)); + sums = vec_add(sums, s1); + carries = vec_add(carries, s2); + + /* Extract sum and carries from right-hand words and combine into result */ + s.iv = sums; + sum = s.w[3]; + + s.iv = carries; + sum += (long long)s.w[3] << 32; + + return sum; +} + diff --git a/libfec/sumsq_mmx.c b/lib/libfec/sumsq_mmx.c similarity index 76% rename from libfec/sumsq_mmx.c rename to lib/libfec/sumsq_mmx.c index e766831..4ef9f82 100644 --- a/libfec/sumsq_mmx.c +++ b/lib/libfec/sumsq_mmx.c @@ -10,23 +10,24 @@ * May be used under the terms of the GNU Lesser Public License (LGPL) */ -long long sumsq_mmx_assist(signed short *,int); +long long sumsq_mmx_assist(signed short *, int); -long long sumsq_mmx(signed short *in,int cnt){ +long long sumsq_mmx(signed short *in, int cnt) +{ long long sum = 0; /* Handle stuff before the next 8-byte boundary */ - while(((int)in & 7) != 0 && cnt != 0){ + while (((int)in & 7) != 0 && cnt != 0) { sum += (long)in[0] * in[0]; in++; cnt--; } - sum += sumsq_mmx_assist(in,cnt); + sum += sumsq_mmx_assist(in, cnt); in += cnt & ~7; cnt &= 7; /* Handle up to 7 words at end */ - while(cnt != 0){ + while (cnt != 0) { sum += (long)in[0] * in[0]; in++; cnt--; diff --git a/libfec/sumsq_mmx_assist.s b/lib/libfec/sumsq_mmx_assist.s similarity index 100% rename from libfec/sumsq_mmx_assist.s rename to lib/libfec/sumsq_mmx_assist.s diff --git a/libfec/sumsq_port.c b/lib/libfec/sumsq_port.c similarity index 76% rename from libfec/sumsq_port.c rename to lib/libfec/sumsq_port.c index 6d0b4c1..3d32b02 100644 --- a/libfec/sumsq_port.c +++ b/lib/libfec/sumsq_port.c @@ -5,11 +5,12 @@ * May be used under the terms of the GNU Lesser General Public License (LGPL) */ -unsigned long long sumsq_port(signed short *in,int cnt){ +unsigned long long sumsq_port(signed short *in, int cnt) +{ long long sum = 0; 
int i; - for(i=0;i +#include +#include +#include +#include "config.h" +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length", 1, NULL, 'l'}, + {"frame-count", 1, NULL, 'n'}, + {"verbose", 0, NULL, 'v'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + {"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {NULL}, +}; +#endif + +int Verbose = 0; + +int main(int argc, char *argv[]) +{ + signed short *buf; + int i, d, trial, trials = 10000; + int bufsize = 2048; + long long port_sum, simd_sum; + time_t t; + int timetrials = 0; + + find_cpu_mode(); + time(&t); + srandom(t); + +#if HAVE_GETOPT_LONG + while ((d = getopt_long(argc, argv, "vapmstl:n:T", Options, NULL)) != EOF) { +#else + while ((d = getopt(argc, argv, "vapmstl:n:T")) != EOF) { +#endif + switch (d) { + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + bufsize = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'v': + Verbose++; + break; + case 'T': + timetrials++; + break; + } + } + + buf = (signed short *)calloc(bufsize, sizeof(signed short)); + if (timetrials) { + for (trial = 0; trial < trials; trial++) { + (void)sumsq(buf, bufsize); + } + } + else { + for (trial = 0; trial < trials; trial++) { + int length, offset; + + offset = random() & 7; + length = (random() % bufsize) - offset; + if (length <= 0) { + continue; + } + for (i = 0; i < bufsize; i++) { + buf[i] = random(); + } + + port_sum = sumsq_port(buf + offset, length); + simd_sum = sumsq(buf + offset, length); + if (port_sum != simd_sum) { + printf("offset %d len %d port_sum = %lld simd_sum = %lld ", offset, length, + port_sum, simd_sum); + + printf("ERROR! 
diff = %lld\n", simd_sum - port_sum); + } + } + } + exit(0); +} diff --git a/lib/libfec/viterbi27.c b/lib/libfec/viterbi27.c new file mode 100644 index 0000000..fc25a14 --- /dev/null +++ b/lib/libfec/viterbi27.c @@ -0,0 +1,195 @@ +/* K=7 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27(int len) +{ + find_cpu_mode(); + + switch (Cpu_mode) { + case PORT: + default: + return create_viterbi27_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi27_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi27_mmx(len); + case SSE: + return create_viterbi27_sse(len); + case SSE2: + return create_viterbi27_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi27_port(len); +#endif + } +} + +void set_viterbi27_polynomial(int polys[2]) +{ + switch (Cpu_mode) { + case PORT: + default: + set_viterbi27_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi27_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi27_polynomial_mmx(polys); + break; + case SSE: + set_viterbi27_polynomial_sse(polys); + break; + case SSE2: + set_viterbi27_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi27_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27(void *p, int starting_state) +{ + switch (Cpu_mode) { + case PORT: + default: + return init_viterbi27_port(p, starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi27_av(p, starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi27_mmx(p, starting_state); + case SSE: + return init_viterbi27_sse(p, starting_state); + case SSE2: + return init_viterbi27_sse2(p, starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return 
init_viterbi27_port(p, starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi27( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + + switch (Cpu_mode) { + case PORT: + default: + return chainback_viterbi27_port(p, data, nbits, endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi27_av(p, data, nbits, endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi27_mmx(p, data, nbits, endstate); + case SSE: + return chainback_viterbi27_sse(p, data, nbits, endstate); + case SSE2: + return chainback_viterbi27_sse2(p, data, nbits, endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi27_port(p, data, nbits, endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27(void *p) +{ + switch (Cpu_mode) { + case PORT: + default: + delete_viterbi27_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi27_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi27_mmx(p); + break; + case SSE: + delete_viterbi27_sse(p); + break; + case SSE2: + delete_viterbi27_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi27_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi27_blk(void *p, unsigned char syms[], int nbits) +{ + if (p == NULL) { + return -1; + } + + switch (Cpu_mode) { + case PORT: + default: + update_viterbi27_blk_port(p, syms, nbits); + break; +#ifdef __VEC__ + case ALTIVEC: + update_viterbi27_blk_av(p, syms, nbits); + break; +#endif +#ifdef __i386__ + case MMX: + update_viterbi27_blk_mmx(p, syms, nbits); + break; + case SSE: + update_viterbi27_blk_sse(p, syms, nbits); + break; + case SSE2: + update_viterbi27_blk_sse2(p, syms, nbits); + break; +#endif +#ifdef __x86_64__ + case SSE2: + update_viterbi27_blk_port(p, syms, nbits); + break; +#endif + } + return 0; +} diff --git a/lib/libfec/viterbi27_av.c b/lib/libfec/viterbi27_av.c new file mode 100644 index 0000000..ff63119 --- /dev/null +++ b/lib/libfec/viterbi27_av.c @@ -0,0 +1,238 @@ +/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +typedef union { + long long p; + unsigned char c[64]; + vector bool char v[4]; +} decision_t; +typedef union { + long long p; + unsigned char c[64]; + vector unsigned char v[4]; +} metric_t; + +static union branchtab27 { + unsigned char c[32]; + vector unsigned char v[2]; +} Branchtab27[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_av(void *p, int starting_state) +{ + struct v27 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 4; i++) { + vp->metrics1.v[i] = (vector unsigned char)(63); + } + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_av(int polys[2]) +{ + int state; + + for (state = 0; state < 32; state++) { + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_av(int len) +{ + struct v27 *vp; + + if (!Init) { + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_av(polys); + } + if ((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) { + return NULL; + } + if ((vp->decisions = (decision_t *)malloc((len + 6) * sizeof( + decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi27_av(vp, 0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v27 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + + if (p == NULL) { + return -1; + } + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = d[nbits].c[endstate >> 2] & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_av(void *p) +{ + struct v27 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + +/* Process received symbols */ +int update_viterbi27_blk_av(void *p, unsigned char *syms, int nbits) +{ + struct v27 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->dp; + while (nbits--) { + vector unsigned char survivor0, survivor1, sym0v, sym1v; + vector bool char decision0, decision1; + vector unsigned char metric, m_metric, m0, m1, m2, m3; + void *tmp; + + /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */ + sym0v = vec_perm(vec_ld(0, syms), vec_ld(1, syms), vec_lvsl(0, syms)); + + sym1v = vec_splat(sym0v, 1); /* Splat syms[1] across sym1v */ + sym0v = vec_splat(sym0v, 0); /* Splat syms[0] across sym0v */ + syms += 2; + + /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */ + + /* Form first set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[0], sym0v), + vec_xor(Branchtab27[1].v[0], sym1v)); + metric = vec_sr(metric, (vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31), metric); + + /* Form first set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[0], metric); + m3 = vec_adds(vp->old_metrics->v[2], metric); + m1 = vec_adds(vp->old_metrics->v[2], m_metric); + m2 = vec_adds(vp->old_metrics->v[0], m_metric); + + /* Form second set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[1], sym0v), + vec_xor(Branchtab27[1].v[1], sym1v)); + metric = vec_sr(metric, (vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31), metric); + + /* Compare and select first set */ + 
decision0 = vec_cmpgt(m0, m1); + decision1 = vec_cmpgt(m2, m3); + survivor0 = vec_min(m0, m1); + survivor1 = vec_min(m2, m3); + + /* Compute second set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[1], metric); + m3 = vec_adds(vp->old_metrics->v[3], metric); + m1 = vec_adds(vp->old_metrics->v[3], m_metric); + m2 = vec_adds(vp->old_metrics->v[1], m_metric); + + /* Interleave and store first decisions and survivors */ + d->v[0] = vec_mergeh(decision0, decision1); + d->v[1] = vec_mergel(decision0, decision1); + vp->new_metrics->v[0] = vec_mergeh(survivor0, survivor1); + vp->new_metrics->v[1] = vec_mergel(survivor0, survivor1); + + /* Compare and select second set */ + decision0 = vec_cmpgt(m0, m1); + decision1 = vec_cmpgt(m2, m3); + survivor0 = vec_min(m0, m1); + survivor1 = vec_min(m2, m3); + + /* Interleave and store second set of decisions and survivors */ + d->v[2] = vec_mergeh(decision0, decision1); + d->v[3] = vec_mergel(decision0, decision1); + vp->new_metrics->v[2] = vec_mergeh(survivor0, survivor1); + vp->new_metrics->v[3] = vec_mergel(survivor0, survivor1); + + /* renormalize if necessary */ + if (vp->new_metrics->c[0] >= 105) { + vector unsigned char scale0, scale1; + + /* Find smallest metric and splat */ + scale0 = vec_min(vp->new_metrics->v[0], vp->new_metrics->v[1]); + scale1 = vec_min(vp->new_metrics->v[2], vp->new_metrics->v[3]); + scale0 = vec_min(scale0, scale1); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 8)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 4)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 2)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 1)); + + /* Now subtract from all metrics */ + vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0], scale0); + vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1], scale0); + vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2], scale0); + vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3], scale0); + } + d++; + /* Swap pointers to old and new metrics */ + 
tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + + return 0; +} + diff --git a/libfec/viterbi27_mmx.c b/lib/libfec/viterbi27_mmx.c similarity index 50% rename from libfec/viterbi27_mmx.c rename to lib/libfec/viterbi27_mmx.c index a6d5125..204ad06 100644 --- a/libfec/viterbi27_mmx.c +++ b/lib/libfec/viterbi27_mmx.c @@ -7,11 +7,17 @@ #include #include "fec.h" -typedef union { char c[64]; __m64 v[8];} decision_t; -typedef union { unsigned char c[64]; __m64 v[8];} metric_t; +typedef union { + char c[64]; + __m64 v[8]; +} decision_t; +typedef union { + unsigned char c[64]; + __m64 v[8]; +} metric_t; -unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); -unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); +unsigned char Mettab27_1[256][32] __attribute__((aligned(16))); +unsigned char Mettab27_2[256][32] __attribute__((aligned(16))); static int Init = 0; /* State info for instance of Viterbi decoder @@ -21,19 +27,23 @@ struct v27 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27_mmx(void *p,int starting_state){ +int init_viterbi27_mmx(void *p, int starting_state) +{ struct v27 *vp = (struct v27 *)p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<64;i++) + } + for (i = 0; i < 64; i++) { vp->metrics1.c[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -42,19 +52,20 @@ int init_viterbi27_mmx(void *p,int starting_state){ return 0; } -void set_viterbi27_polynomial_mmx(int polys[2]){ +void set_viterbi27_polynomial_mmx(int polys[2]) +{ 
int state; - for(state=0;state < 32;state++){ + for (state = 0; state < 32; state++) { int symbol; - for(symbol = 0;symbol < 256;symbol++){ + for (symbol = 0; symbol < 256; symbol++) { int sym; - sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); - Mettab27_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + sym = parity((2 * state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab27_1[symbol][state] = (sym ? (255 - symbol) : symbol) / 16; - sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); - Mettab27_2[symbol][state] = (sym ? (255-symbol):symbol) / 16; + sym = parity((2 * state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab27_2[symbol][state] = (sym ? (255 - symbol) : symbol) / 16; } } Init++; @@ -62,53 +73,59 @@ void set_viterbi27_polynomial_mmx(int polys[2]){ /* Create a new instance of a Viterbi decoder */ -void *create_viterbi27_mmx(int len){ +void *create_viterbi27_mmx(int len) +{ struct v27 *vp; int polys[2] = { V27POLYA, V27POLYB }; - - if(Init == 0){ + + if (Init == 0) { set_viterbi27_polynomial_mmx(polys); } - if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + if ((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) { return NULL; + } - if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + if ((vp->decisions = (decision_t *)malloc((len + 6) * sizeof( + decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi27_mmx(vp,0); + init_viterbi27_mmx(vp, 0); return vp; } /* Viterbi chainback */ int chainback_viterbi27_mmx( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v27 *vp = (struct v27 *)p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = (decision_t *)vp->decisions; endstate &= 63; d += 6; /* 
Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - k = d[nbits].c[endstate>>2] & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + k = d[nbits].c[endstate >> 2] & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi27_mmx(void *p){ +void delete_viterbi27_mmx(void *p) +{ struct v27 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } diff --git a/libfec/viterbi27_port.c b/lib/libfec/viterbi27_port.c similarity index 68% rename from libfec/viterbi27_port.c rename to lib/libfec/viterbi27_port.c index 7cac2b3..9f242fe 100644 --- a/libfec/viterbi27_port.c +++ b/lib/libfec/viterbi27_port.c @@ -9,9 +9,15 @@ #include "fec.h" -typedef union { unsigned int w[64]; } metric_t; -typedef union { unsigned long w[2];} decision_t; -static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); +typedef union { + unsigned int w[64]; +} metric_t; +typedef union { + unsigned long w[2]; +} decision_t; +static union branchtab27 { + unsigned char c[32]; +} Branchtab27[2] __attribute__((aligned(16))); static int Init = 0; /* State info for instance of Viterbi decoder @@ -21,19 +27,23 @@ struct v27 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27_port(void *p,int starting_state){ +int init_viterbi27_port(void *p, int starting_state) +{ struct v27 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<64;i++) + } + for (i = 0; i < 64; i++) { vp->metrics1.w[i] = 63; + } 
vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -42,46 +52,53 @@ int init_viterbi27_port(void *p,int starting_state){ return 0; } -void set_viterbi27_polynomial_port(int polys[2]){ +void set_viterbi27_polynomial_port(int polys[2]) +{ int state; - for(state=0;state < 32;state++){ - Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + for (state = 0; state < 32; state++) { + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 255 : 0; } Init++; } /* Create a new instance of a Viterbi decoder */ -void *create_viterbi27_port(int len){ +void *create_viterbi27_port(int len) +{ struct v27 *vp; - if(!Init){ + if (!Init) { int polys[2] = { V27POLYA, V27POLYB }; set_viterbi27_polynomial_port(polys); } - if((vp = malloc(sizeof(struct v27))) == NULL) - return NULL; - if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + if ((vp = malloc(sizeof(struct v27))) == NULL) { + return NULL; + } + if ((vp->decisions = malloc((len + 6) * sizeof(decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi27_port(vp,0); + init_viterbi27_port(vp, 0); return vp; } /* Viterbi chainback */ int chainback_viterbi27_port( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v27 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can * accumulate a full byte of decoded data @@ -94,20 +111,21 @@ int chainback_viterbi27_port( * combine 
in the cache anyway */ d += 6; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - k = (d[nbits].w[(endstate>>2)/32] >> ((endstate>>2)%32)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + k = (d[nbits].w[(endstate >> 2) / 32] >> ((endstate >> 2) % 32)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi27_port(void *p){ +void delete_viterbi27_port(void *p) +{ struct v27 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } @@ -133,21 +151,23 @@ unsigned int metric,m0,m1,decision;\ * Note that nbits is the number of decoded data bits, not the number * of symbols! */ -int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){ +int update_viterbi27_blk_port(void *p, unsigned char *syms, int nbits) +{ struct v27 *vp = p; void *tmp; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = (decision_t *)vp->dp; - while(nbits--){ - unsigned char sym0,sym1; + while (nbits--) { + unsigned char sym0, sym1; d->w[0] = d->w[1] = 0; sym0 = *syms++; sym1 = *syms++; - + BFLY(0); BFLY(1); BFLY(2); @@ -185,7 +205,7 @@ int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){ tmp = vp->old_metrics; vp->old_metrics = vp->new_metrics; vp->new_metrics = tmp; - } + } vp->dp = d; return 0; } diff --git a/libfec/viterbi27_sse.c b/lib/libfec/viterbi27_sse.c similarity index 55% rename from libfec/viterbi27_sse.c rename to lib/libfec/viterbi27_sse.c index cd1f287..2d12699 100644 --- a/libfec/viterbi27_sse.c +++ b/lib/libfec/viterbi27_sse.c @@ -7,9 +7,18 @@ #include #include "fec.h" -typedef union { unsigned char c[64]; } metric_t; -typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; -union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; +typedef union { + unsigned char c[64]; +} metric_t; +typedef union { + unsigned long w[2]; + unsigned char c[8]; + 
__m64 v[1]; +} decision_t; +union branchtab27 { + unsigned char c[32]; + __m64 v[4]; +} Branchtab27_sse[2]; static int Init = 0; /* State info for instance of Viterbi decoder @@ -19,48 +28,57 @@ struct v27 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Create a new instance of a Viterbi decoder */ -void *create_viterbi27_sse(int len){ +void *create_viterbi27_sse(int len) +{ struct v27 *vp; - if(!Init){ + if (!Init) { int polys[2] = { V27POLYA, V27POLYB }; set_viterbi27_polynomial_sse(polys); } - if((vp = malloc(sizeof(struct v27))) == NULL) + if ((vp = malloc(sizeof(struct v27))) == NULL) { return NULL; - if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + } + if ((vp->decisions = malloc((len + 6) * sizeof(decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi27(vp,0); + init_viterbi27(vp, 0); return vp; } -void set_viterbi27_polynomial_sse(int polys[2]){ +void set_viterbi27_polynomial_sse(int polys[2]) +{ int state; - for(state=0;state < 32;state++){ - Branchtab27_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab27_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + for (state = 0; state < 32; state++) { + Branchtab27_sse[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab27_sse[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; } Init++; } /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27_sse(void *p,int starting_state){ +int init_viterbi27_sse(void *p, int starting_state) +{ struct v27 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<64;i++) + } + for (i = 0; i < 64; i++) { vp->metrics1.c[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -71,15 +89,17 @@ int init_viterbi27_sse(void *p,int starting_state){ /* Viterbi chainback */ int chainback_viterbi27_sse( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v27 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can @@ -93,20 +113,21 @@ int chainback_viterbi27_sse( * combine in the cache anyway */ d += 6; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + k = (d[nbits].c[(endstate >> 2) / 8] >> ((endstate >> 2) % 8)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi27_sse(void *p){ +void delete_viterbi27_sse(void *p) +{ struct v27 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } diff --git a/lib/libfec/viterbi27_sse2.c b/lib/libfec/viterbi27_sse2.c new file mode 100644 index 0000000..bb82668 --- /dev/null +++ b/lib/libfec/viterbi27_sse2.c @@ -0,0 +1,209 @@ +/* K=7 r=1/2 Viterbi decoder for SSE2 + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { + 
unsigned char c[64]; + __m128i v[4]; +} metric_t; +typedef union { + unsigned long w[2]; + unsigned char c[8]; + unsigned short s[4]; + __m64 v[1]; +} decision_t; +union branchtab27 { + unsigned char c[32]; + __m128i v[2]; +} Branchtab27_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly27.s! + */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse2(void *p, int starting_state) +{ + struct v27 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 64; i++) { + vp->metrics1.c[i] = 63; + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_sse2(int polys[2]) +{ + int state; + + for (state = 0; state < 32; state++) { + Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_sse2(int len) +{ + void *p; + struct v27 *vp; + + if (!Init) { + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if (posix_memalign(&p, sizeof(__m128i), sizeof(struct v27))) { + return NULL; + } + vp = (struct v27 *)p; + + if ((p = malloc((len + 6) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi27_sse2(vp, 0); + + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v27 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = (d[nbits].c[(endstate >> 2) / 8] >> ((endstate >> 2) % 8)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse2(void *p) +{ + struct v27 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +#if 0 +/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. 
*/ +void update_viterbi27_blk_sse2(void *p, unsigned char *syms, int nbits) +{ + struct v27 *vp = p; + decision_t *d; + + if (p == NULL) { + return; + } + d = (decision_t *)vp->dp; + while (nbits--) { + __m128i sym0v, sym1v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi8(syms[0]); + sym1v = _mm_set1_epi8(syms[1]); + syms += 2; + + for (i = 0; i < 2; i++) { + __m128i decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics */ + metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i], sym0v), + _mm_xor_si128(Branchtab27_sse2[1].v[i], sym1v)); + /* There's no packed bytes right shift in SSE2, so we use the word version and mask + * (I'm *really* starting to like Altivec...) + */ + metric = _mm_srli_epi16(metric, 3); + metric = _mm_and_si128(metric, _mm_set1_epi8(31)); + m_metric = _mm_sub_epi8(_mm_set1_epi8(31), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_epi8(vp->old_metrics->v[i], metric); + m3 = _mm_add_epi8(vp->old_metrics->v[2 + i], metric); + m1 = _mm_add_epi8(vp->old_metrics->v[2 + i], m_metric); + m2 = _mm_add_epi8(vp->old_metrics->v[i], m_metric); + + /* Compare and select, using modulo arithmetic */ + decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0, m1), _mm_setzero_si128()); + decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2, m3), _mm_setzero_si128()); + survivor0 = _mm_or_si128(_mm_and_si128(decision0, m1), + _mm_andnot_si128(decision0, m0)); + survivor1 = _mm_or_si128(_mm_and_si128(decision1, m3), + _mm_andnot_si128(decision1, m2)); + + /* Pack each set of decisions into 16 bits */ + d->s[2 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0, decision1)); + d->s[2 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0, decision1)); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_epi8(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_epi8(survivor0, survivor1); 
+ } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; +} +#endif diff --git a/lib/libfec/viterbi29.c b/lib/libfec/viterbi29.c new file mode 100644 index 0000000..cd405a9 --- /dev/null +++ b/lib/libfec/viterbi29.c @@ -0,0 +1,184 @@ +/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29(int len) +{ + find_cpu_mode(); + + switch (Cpu_mode) { + case PORT: + default: + return create_viterbi29_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi29_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi29_mmx(len); + case SSE: + return create_viterbi29_sse(len); + case SSE2: + return create_viterbi29_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi29_port(len); +#endif + } +} + +void set_viterbi29_polynomial(int polys[2]) +{ + switch (Cpu_mode) { + case PORT: + default: + set_viterbi29_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi29_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi29_polynomial_mmx(polys); + break; + case SSE: + set_viterbi29_polynomial_sse(polys); + break; + case SSE2: + set_viterbi29_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi29_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29(void *p, int starting_state) +{ + switch (Cpu_mode) { + case PORT: + default: + return init_viterbi29_port(p, starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi29_av(p, starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi29_mmx(p, starting_state); + case SSE: + return init_viterbi29_sse(p, starting_state); + 
case SSE2: + return init_viterbi29_sse2(p, starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi29_port(p, starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi29( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + + switch (Cpu_mode) { + case PORT: + default: + return chainback_viterbi29_port(p, data, nbits, endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi29_av(p, data, nbits, endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi29_mmx(p, data, nbits, endstate); + case SSE: + return chainback_viterbi29_sse(p, data, nbits, endstate); + case SSE2: + return chainback_viterbi29_sse2(p, data, nbits, endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi29_port(p, data, nbits, endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29(void *p) +{ + switch (Cpu_mode) { + case PORT: + default: + delete_viterbi29_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi29_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi29_mmx(p); + break; + case SSE: + delete_viterbi29_sse(p); + break; + case SSE2: + delete_viterbi29_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi29_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi29_blk(void *p, unsigned char syms[], int nbits) +{ + switch (Cpu_mode) { + case PORT: + default: + return update_viterbi29_blk_port(p, syms, nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi29_blk_av(p, syms, nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi29_blk_mmx(p, syms, nbits); + case SSE: + return update_viterbi29_blk_sse(p, syms, nbits); + case SSE2: + return update_viterbi29_blk_sse2(p, syms, nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi29_blk_port(p, syms, nbits); +#endif + } +} diff --git a/lib/libfec/viterbi29_av.c b/lib/libfec/viterbi29_av.c new file mode 100644 index 0000000..2ef010e --- /dev/null +++ b/lib/libfec/viterbi29_av.c @@ -0,0 +1,217 @@ +/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned char c[256]; + vector bool char v[16]; +} decision_t; +typedef union { + unsigned char c[256]; + vector unsigned char v[16]; +} metric_t; + +static union branchtab29 { + unsigned char c[128]; + vector unsigned char v[8]; +} Branchtab29[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_av(void *p, int starting_state) +{ + struct v29 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 16; i++) { + vp->metrics1.v[i] = (vector unsigned char)(63); + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; 
+ vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_av(int polys[2]) +{ + int state; + + for (state = 0; state < 128; state++) { + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_av(int len) +{ + struct v29 *vp; + + if (!Init) { + int polys[2] = { V29POLYA, V29POLYB }; + set_viterbi29_polynomial_av(polys); + } + if ((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) { + return NULL; + } + if ((vp->decisions = (decision_t *)malloc((len + 8) * sizeof( + decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi29_av(vp, 0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi29_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v29 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_av(void *p) +{ + struct v29 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi29_blk_av(void *p, unsigned char *syms, int nbits) +{ + struct v29 *vp = p; + decision_t *d; + int i; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->dp; + + while (nbits--) { + vector unsigned char sym1v, sym2v; + void *tmp; + + /* All this seems necessary just to load a byte into all elements of a vector! */ + sym1v = vec_perm(vec_ld(0, syms), vec_ld(1, syms), vec_lvsl(0, + syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */ + sym2v = vec_splat(sym1v, 1); /* Splat syms[1] across sym2v */ + sym1v = vec_splat(sym1v, 0); /* Splat syms[0] across sym1v */ + syms += 2; + + for (i = 0; i < 8; i++) { + vector bool char decision0, decision1; + vector unsigned char metric, m_metric, m0, m1, m2, m3, survivor0, survivor1; + + /* Form branch metrics */ + metric = vec_avg(vec_xor(Branchtab29[0].v[i], sym1v), + vec_xor(Branchtab29[1].v[i], sym2v)); + metric = vec_sr(metric, (vector unsigned char)(3)); + m_metric = (vector unsigned char)(31) - metric; + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i], metric); + m3 = vec_adds(vp->old_metrics->v[8 + i], metric); + m1 = vec_adds(vp->old_metrics->v[8 + i], m_metric); + m2 = vec_adds(vp->old_metrics->v[i], m_metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0, m1); + decision1 = vec_cmpgt(m2, m3); + survivor0 = vec_min(m0, m1); + survivor1 = vec_min(m2, m3); + + /* Interleave and store decisions and survivors */ + d->v[2 * i] = vec_mergeh(decision0, decision1); + d->v[2 * i + 1] = vec_mergel(decision0, 
decision1); + vp->new_metrics->v[2 * i] = vec_mergeh(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = vec_mergel(survivor0, survivor1); + } + d++; + /* renormalize if necessary */ + if (vp->new_metrics->c[0] >= 50) { + int i; + vector unsigned char scale0, scale1; + + /* Find smallest metric and splat */ + scale0 = vp->new_metrics->v[0]; + scale1 = vp->new_metrics->v[1]; + for (i = 2; i < 16; i += 2) { + scale0 = vec_min(scale0, vp->new_metrics->v[i]); + scale1 = vec_min(scale1, vp->new_metrics->v[i + 1]); + } + scale0 = vec_min(scale0, scale1); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 8)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 4)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 2)); + scale0 = vec_min(scale0, vec_sld(scale0, scale0, 1)); + + /* Now subtract from all metrics */ + for (i = 0; i < 16; i++) { + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i], scale0); + } + } + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi29_mmx.c b/lib/libfec/viterbi29_mmx.c similarity index 51% rename from libfec/viterbi29_mmx.c rename to lib/libfec/viterbi29_mmx.c index 563f40a..67269b4 100644 --- a/libfec/viterbi29_mmx.c +++ b/lib/libfec/viterbi29_mmx.c @@ -8,11 +8,17 @@ #include #include "fec.h" -typedef union { char c[256]; __m64 v[32];} decision_t; -typedef union { unsigned char c[256]; __m64 v[32];} metric_t; +typedef union { + char c[256]; + __m64 v[32]; +} decision_t; +typedef union { + unsigned char c[256]; + __m64 v[32]; +} metric_t; -unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8))); -unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8))); +unsigned char Mettab29_1[256][128] __attribute__((aligned(8))); +unsigned char Mettab29_2[256][128] __attribute__((aligned(8))); static int Init = 0; /* State info for instance of Viterbi decoder @@ -22,58 +28,66 @@ struct v29 { 
metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Create a new instance of a Viterbi decoder */ -void *create_viterbi29_mmx(int len){ +void *create_viterbi29_mmx(int len) +{ struct v29 *vp; - if(Init == 0){ - int polys[2] = {V29POLYA,V29POLYB}; + if (Init == 0) { + int polys[2] = {V29POLYA, V29POLYB}; set_viterbi29_polynomial_mmx(polys); } - if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + if ((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) { return NULL; + } - if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + if ((vp->decisions = (decision_t *)malloc((len + 8) * sizeof( + decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi29(vp,0); + init_viterbi29(vp, 0); return vp; } -void set_viterbi29_polynomial_mmx(int polys[2]){ +void set_viterbi29_polynomial_mmx(int polys[2]) +{ int state; - for(state=0;state < 128;state++){ + for (state = 0; state < 128; state++) { int symbol; - for(symbol = 0;symbol < 256;symbol++){ + for (symbol = 0; symbol < 256; symbol++) { int sym; - sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); - Mettab29_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + sym = parity((2 * state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab29_1[symbol][state] = (sym ? (255 - symbol) : symbol) / 16; - sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); - Mettab29_2[symbol][state] = (sym ? (255-symbol):symbol) / 16; + sym = parity((2 * state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab29_2[symbol][state] = (sym ? 
(255 - symbol) : symbol) / 16; } } Init++; } /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29_mmx(void *p,int starting_state){ +int init_viterbi29_mmx(void *p, int starting_state) +{ struct v29 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<256;i++) + } + for (i = 0; i < 256; i++) { vp->metrics1.c[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -84,34 +98,37 @@ int init_viterbi29_mmx(void *p,int starting_state){ /* Viterbi chainback */ int chainback_viterbi29_mmx( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v29 *vp = (struct v29 *)p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = (decision_t *)vp->decisions; endstate &= 255; d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; k = d[nbits].c[endstate] & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi29_mmx(void *p){ +void delete_viterbi29_mmx(void *p) +{ struct v29 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } diff --git a/libfec/viterbi29_port.c b/lib/libfec/viterbi29_port.c similarity index 62% rename from libfec/viterbi29_port.c rename to lib/libfec/viterbi29_port.c index 292dce8..b9ab118 100644 --- a/libfec/viterbi29_port.c +++ b/lib/libfec/viterbi29_port.c @@ -7,10 +7,16 @@ #include #include "fec.h" -typedef union { unsigned int w[256]; } metric_t; -typedef union { unsigned long w[8];} decision_t; +typedef union { + unsigned int w[256]; +} metric_t; +typedef union { + unsigned long w[8]; +} decision_t; 
-static union { unsigned char c[128]; } Branchtab29[2]; +static union { + unsigned char c[128]; +} Branchtab29[2]; static int Init = 0; /* State info for instance of Viterbi decoder */ @@ -18,19 +24,23 @@ struct v29 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29_port(void *p,int starting_state){ +int init_viterbi29_port(void *p, int starting_state) +{ struct v29 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<256;i++) + } + for (i = 0; i < 256; i++) { vp->metrics1.w[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -39,33 +49,39 @@ int init_viterbi29_port(void *p,int starting_state){ return 0; } -void set_viterbi29_polynomial_port(int polys[2]){ +void set_viterbi29_polynomial_port(int polys[2]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + for (state = 0; state < 128; state++) { + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; } Init++; } /* Create a new instance of a Viterbi decoder */ -void *create_viterbi29_port(int len){ +void *create_viterbi29_port(int len) +{ struct v29 *vp; - if(!Init){ - int polys[2] = {V29POLYA,V29POLYB}; + if (!Init) { + int polys[2] = {V29POLYA, V29POLYB}; set_viterbi29_polynomial_port(polys); } - if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + if ((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) { return NULL; + } - if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + if ((vp->decisions = (decision_t *)malloc((len + 8) * sizeof( + decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi29_port(vp,0); + init_viterbi29_port(vp, 0); return vp; } @@ -73,15 +89,17 @@ void *create_viterbi29_port(int len){ /* Viterbi chainback */ int chainback_viterbi29_port( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v29 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can @@ -94,21 +112,22 @@ int chainback_viterbi29_port( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - - k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + + k = (d[nbits].w[(endstate) / 32] >> (endstate % 32)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi29_port(void *p){ +void delete_viterbi29_port(void *p) +{ struct v29 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } @@ -135,32 +154,36 @@ unsigned int 
metric,m0,m1,decision;\ * of symbols! */ -int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){ +int update_viterbi29_blk_port(void *p, unsigned char *syms, int nbits) +{ struct v29 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; - + } + d = (decision_t *)vp->dp; - while(nbits--){ + while (nbits--) { void *tmp; - unsigned char sym0,sym1; + unsigned char sym0, sym1; int i; - for(i=0;i<8;i++) + for (i = 0; i < 8; i++) { d->w[i] = 0; + } sym0 = *syms++; sym1 = *syms++; - - for(i=0;i<128;i++) + + for (i = 0; i < 128; i++) { BFLY(i); + } d++; tmp = vp->old_metrics; vp->old_metrics = vp->new_metrics; vp->new_metrics = tmp; - } + } vp->dp = d; return 0; } diff --git a/libfec/viterbi29_sse.c b/lib/libfec/viterbi29_sse.c similarity index 54% rename from libfec/viterbi29_sse.c rename to lib/libfec/viterbi29_sse.c index 4a92e5f..27f0a50 100644 --- a/libfec/viterbi29_sse.c +++ b/lib/libfec/viterbi29_sse.c @@ -8,10 +8,19 @@ #include #include "fec.h" -typedef union { unsigned char w[256]; __m64 v[32];} metric_t; -typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t; +typedef union { + unsigned char w[256]; + __m64 v[32]; +} metric_t; +typedef union { + unsigned long w[8]; + unsigned char c[32]; + __m64 v[4]; +} decision_t; -union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2]; +union branchtab29 { + unsigned char c[128]; +} Branchtab29_sse[2]; static int Init = 0; /* State info for instance of Viterbi decoder @@ -21,48 +30,58 @@ struct v29 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Create a new instance of a Viterbi decoder */ -void 
*create_viterbi29_sse(int len){ +void *create_viterbi29_sse(int len) +{ struct v29 *vp; - if(!Init){ - int polys[2] = { V29POLYA,V29POLYB }; + if (!Init) { + int polys[2] = { V29POLYA, V29POLYB }; set_viterbi29_polynomial_sse(polys); } - if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + if ((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) { return NULL; - if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + } + if ((vp->decisions = (decision_t *)malloc((len + 8) * sizeof( + decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi29(vp,0); + init_viterbi29(vp, 0); return vp; } -void set_viterbi29_polynomial_sse(int polys[2]){ +void set_viterbi29_polynomial_sse(int polys[2]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab29_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab29_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + for (state = 0; state < 128; state++) { + Branchtab29_sse[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab29_sse[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; } Init++; } /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29_sse(void *p,int starting_state){ +int init_viterbi29_sse(void *p, int starting_state) +{ struct v29 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<256;i++) + } + for (i = 0; i < 256; i++) { vp->metrics1.w[i] = 200; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -73,15 +92,17 @@ int init_viterbi29_sse(void *p,int starting_state){ /* Viterbi chainback */ int chainback_viterbi29_sse( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v29 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can * accumulate a full byte of decoded data @@ -93,21 +114,22 @@ int chainback_viterbi29_sse( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - - k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + + k = (d[nbits].c[endstate / 8] >> (endstate % 8)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi29_sse(void *p){ +void delete_viterbi29_sse(void *p) +{ struct v29 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } diff --git a/libfec/viterbi29_sse2.c b/lib/libfec/viterbi29_sse2.c similarity index 57% rename from libfec/viterbi29_sse2.c rename to lib/libfec/viterbi29_sse2.c index 4c7336c..f23c752 100644 --- a/libfec/viterbi29_sse2.c +++ b/lib/libfec/viterbi29_sse2.c @@ -8,10 +8,18 @@ #include #include "fec.h" -typedef 
union { unsigned char c[256]; __m128i v[16];} metric_t; -typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; +typedef union { + unsigned char c[256]; + __m128i v[16]; +} metric_t; +typedef union { + unsigned long w[8]; + unsigned char c[32]; +} decision_t; -union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2]; +union branchtab29 { + unsigned char c[128]; +} Branchtab29_sse2[2]; static int Init = 0; /* State info for instance of Viterbi decoder @@ -21,17 +29,20 @@ struct v29 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29_sse2(void *p,int starting_state){ +int init_viterbi29_sse2(void *p, int starting_state) +{ struct v29 *vp = p; int i; - for(i=0;i<256;i++) + for (i = 0; i < 256; i++) { vp->metrics1.c[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -40,52 +51,59 @@ int init_viterbi29_sse2(void *p,int starting_state){ return 0; } -void set_viterbi29_polynomial_sse2(int polys[2]){ +void set_viterbi29_polynomial_sse2(int polys[2]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab29_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab29_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + for (state = 0; state < 128; state++) { + Branchtab29_sse2[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab29_sse2[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 
255 : 0; } Init++; } /* Create a new instance of a Viterbi decoder */ -void *create_viterbi29_sse2(int len){ +void *create_viterbi29_sse2(int len) +{ void *p; struct v29 *vp; - if(!Init){ - int polys[2] = {V29POLYA,V29POLYB}; + if (!Init) { + int polys[2] = {V29POLYA, V29POLYB}; set_viterbi29_polynomial(polys); } /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ - if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29))) + if (posix_memalign(&p, sizeof(__m128i), sizeof(struct v29))) { return NULL; + } vp = (struct v29 *)p; - if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + if ((p = malloc((len + 8) * sizeof(decision_t))) == NULL) { free(vp); return NULL; } vp->decisions = (decision_t *)p; - init_viterbi29_sse2(vp,0); + init_viterbi29_sse2(vp, 0); return vp; } /* Viterbi chainback */ int chainback_viterbi29_sse2( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v29 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can @@ -98,21 +116,22 @@ int chainback_viterbi29_sse2( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - - k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + + k = (d[nbits].c[endstate / 8] >> (endstate % 8)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi29_sse2(void *p){ +void delete_viterbi29_sse2(void *p) +{ struct v29 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } diff --git 
a/lib/libfec/viterbi39.c b/lib/libfec/viterbi39.c new file mode 100644 index 0000000..d798e25 --- /dev/null +++ b/lib/libfec/viterbi39.c @@ -0,0 +1,185 @@ +/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Aug 2006, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39(int len) +{ + find_cpu_mode(); + + switch (Cpu_mode) { + case PORT: + default: + return create_viterbi39_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi39_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi39_mmx(len); + case SSE: + return create_viterbi39_sse(len); + case SSE2: + return create_viterbi39_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi39_port(len); +#endif + } +} + +void set_viterbi39_polynomial(int polys[3]) +{ + switch (Cpu_mode) { + case PORT: + default: + set_viterbi39_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi39_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi39_polynomial_mmx(polys); + break; + case SSE: + set_viterbi39_polynomial_sse(polys); + break; + case SSE2: + set_viterbi39_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi39_polynomial_port(polys); + break; +#endif + } +} + + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39(void *p, int starting_state) +{ + switch (Cpu_mode) { + case PORT: + default: + return init_viterbi39_port(p, starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi39_av(p, starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi39_mmx(p, starting_state); + case SSE: + return init_viterbi39_sse(p, starting_state); + case SSE2: + return init_viterbi39_sse2(p, starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi39_port(p, starting_state); +#endif + } +} + +/* Viterbi 
chainback */ +int chainback_viterbi39( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + + switch (Cpu_mode) { + case PORT: + default: + return chainback_viterbi39_port(p, data, nbits, endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi39_av(p, data, nbits, endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi39_mmx(p, data, nbits, endstate); + case SSE: + return chainback_viterbi39_sse(p, data, nbits, endstate); + case SSE2: + return chainback_viterbi39_sse2(p, data, nbits, endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi39_port(p, data, nbits, endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39(void *p) +{ + switch (Cpu_mode) { + case PORT: + default: + delete_viterbi39_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi39_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi39_mmx(p); + break; + case SSE: + delete_viterbi39_sse(p); + break; + case SSE2: + delete_viterbi39_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi39_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi39_blk(void *p, unsigned char syms[], int nbits) +{ + switch (Cpu_mode) { + case PORT: + default: + return update_viterbi39_blk_port(p, syms, nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi39_blk_av(p, syms, nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi39_blk_mmx(p, syms, nbits); + case SSE: + return update_viterbi39_blk_sse(p, syms, nbits); + case SSE2: + return update_viterbi39_blk_sse2(p, syms, nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi39_blk_port(p, syms, nbits); +#endif + } +} diff --git a/libfec/viterbi39_av.c b/lib/libfec/viterbi39_av.c similarity index 50% rename from libfec/viterbi39_av.c rename to lib/libfec/viterbi39_av.c index 2deed51..00f09a4 100644 --- a/libfec/viterbi39_av.c +++ b/lib/libfec/viterbi39_av.c @@ -9,10 +9,19 @@ #include #include "fec.h" -typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t; -typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t; +typedef union { + unsigned char c[2][16]; + vector unsigned char v[2]; +} decision_t; +typedef union { + unsigned short s[256]; + vector unsigned short v[32]; +} metric_t; -static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3]; +static union branchtab39 { + unsigned short s[128]; + vector unsigned short v[16]; +} Branchtab39[3]; static int Init = 0; /* State info for instance of Viterbi decoder */ @@ -20,17 +29,20 @@ struct v39 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ void *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39_av(void *p,int starting_state){ +int 
init_viterbi39_av(void *p, int starting_state) +{ struct v39 *vp = p; int i; - for(i=0;i<32;i++) + for (i = 0; i < 32; i++) { vp->metrics1.v[i] = (vector unsigned short)(1000); + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -39,38 +51,44 @@ int init_viterbi39_av(void *p,int starting_state){ return 0; } -void set_viterbi39_polynomial_av(int polys[3]){ +void set_viterbi39_polynomial_av(int polys[3]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; - Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255 : 0; + for (state = 0; state < 128; state++) { + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 255 : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2 * state) & abs( + polys[2])) ? 
255 : 0; } Init++; } /* Create a new instance of a Viterbi decoder */ -void *create_viterbi39_av(int len){ +void *create_viterbi39_av(int len) +{ struct v39 *vp; - if(!Init){ + if (!Init) { int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; set_viterbi39_polynomial_av(polys); } vp = (struct v39 *)malloc(sizeof(struct v39)); - vp->decisions = malloc(sizeof(decision_t)*(len+8)); - init_viterbi39_av(vp,0); + vp->decisions = malloc(sizeof(decision_t) * (len + 8)); + init_viterbi39_av(vp, 0); return vp; } /* Viterbi chainback */ int chainback_viterbi39_av( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v39 *vp = p; decision_t *d = (decision_t *)vp->decisions; int path_metric; @@ -87,90 +105,100 @@ int chainback_viterbi39_av( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - - k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> (( + endstate >> 4) & 7))) ? 
1 : 0; endstate = (k << 7) | (endstate >> 1); - data[nbits>>3] = endstate; + data[nbits >> 3] = endstate; } return path_metric; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi39_av(void *p){ +void delete_viterbi39_av(void *p) +{ struct v39 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } } -int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ +int update_viterbi39_blk_av(void *p, unsigned char *syms, int nbits) +{ struct v39 *vp = p; decision_t *d = (decision_t *)vp->dp; int path_metric = 0; vector unsigned char decisions = (vector unsigned char)(0); - while(nbits--){ - vector unsigned short symv,sym0v,sym1v,sym2v; + while (nbits--) { + vector unsigned short symv, sym0v, sym1v, sym2v; vector unsigned char s; void *tmp; int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); - symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ - sym0v = vec_splat(symv,0); - sym1v = vec_splat(symv,1); - sym2v = vec_splat(symv,2); + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0, syms), vec_ld(5, syms), vec_lvsl(0, + syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0), + s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv, 0); + sym1v = vec_splat(symv, 1); + sym2v = vec_splat(symv, 2); syms += 3; - - for(i=0;i<16;i++){ - vector bool short decision0,decision1; - vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + for (i = 0; i < 16; i++) { + vector bool short decision0, decision1; + vector unsigned short metric, m_metric, m0, m1, m2, m3, survivor0, survivor1; /* Form branch metrics * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, * the XOR operations constitute 
conditional negation. * the metrics are in the range 0-765 */ - m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v)); - m1 = vec_xor(Branchtab39[2].v[i],sym2v); - metric = vec_add(m0,m1); - m_metric = vec_sub((vector unsigned short)(765),metric); - + m0 = vec_add(vec_xor(Branchtab39[0].v[i], sym0v), vec_xor(Branchtab39[1].v[i], + sym1v)); + m1 = vec_xor(Branchtab39[2].v[i], sym2v); + metric = vec_add(m0, m1); + m_metric = vec_sub((vector unsigned short)(765), metric); + /* Add branch metrics to path metrics */ - m0 = vec_adds(vp->old_metrics->v[i],metric); - m3 = vec_adds(vp->old_metrics->v[16+i],metric); - m1 = vec_adds(vp->old_metrics->v[16+i],m_metric); - m2 = vec_adds(vp->old_metrics->v[i],m_metric); - + m0 = vec_adds(vp->old_metrics->v[i], metric); + m3 = vec_adds(vp->old_metrics->v[16 + i], metric); + m1 = vec_adds(vp->old_metrics->v[16 + i], m_metric); + m2 = vec_adds(vp->old_metrics->v[i], m_metric); + /* Compare and select */ - decision0 = vec_cmpgt(m0,m1); - decision1 = vec_cmpgt(m2,m3); - survivor0 = vec_min(m0,m1); - survivor1 = vec_min(m2,m3); - + decision0 = vec_cmpgt(m0, m1); + decision1 = vec_cmpgt(m2, m3); + survivor0 = vec_min(m0, m1); + survivor1 = vec_min(m2, m3); + /* Store decisions and survivors. * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in * a funny interleaved fashion that we undo in the chainback function. */ - decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + decisions = vec_add(decisions, + decisions); /* Shift each byte 1 bit to the left */ /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting * 0xff is equivalent to adding 1, which sets the lsb. 
*/ - decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + decisions = vec_sub(decisions, + (vector unsigned char)vec_pack(vec_mergeh(decision0, decision1), + vec_mergel(decision0, decision1))); - vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + vp->new_metrics->v[2 * i] = vec_mergeh(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = vec_mergel(survivor0, survivor1); - if((i % 8) == 7){ - /* We've accumulated a total of 128 decisions, stash and start again */ - d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + if ((i % 8) == 7) { + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i >> 3] = + decisions; /* No need to clear, the new bits will replace the old */ } } #if 0 @@ -181,29 +209,32 @@ int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ int i; vector unsigned short min_metric; vector unsigned short max_metric; - union { vector unsigned short v; unsigned short s[8];} t; - int minimum,maximum; + union { + vector unsigned short v; + unsigned short s[8]; + } t; + int minimum, maximum; static int max_spread = 0; min_metric = max_metric = vp->new_metrics->v[0]; - for(i=1;i<32;i++){ - min_metric = vec_min(min_metric,vp->new_metrics->v[i]); - max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + for (i = 1; i < 32; i++) { + min_metric = vec_min(min_metric, vp->new_metrics->v[i]); + max_metric = vec_max(max_metric, vp->new_metrics->v[i]); } - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); - max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); - max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); - max_metric = 
vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 8)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 8)); + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 4)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 4)); + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 2)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 2)); t.v = min_metric; minimum = t.s[0]; t.v = max_metric; maximum = t.s[0]; - if(maximum-minimum > max_spread){ - max_spread = maximum-minimum; - printf("metric spread = %d\n",max_spread); + if (maximum - minimum > max_spread) { + max_spread = maximum - minimum; + printf("metric spread = %d\n", max_spread); } } #endif @@ -219,24 +250,29 @@ int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ * All this is borne out by experiment. */ - if(vp->new_metrics->s[0] >= USHRT_MAX-5000){ + if (vp->new_metrics->s[0] >= USHRT_MAX - 5000) { vector unsigned short scale; - union { vector unsigned short v; unsigned short s[8];} t; - + union { + vector unsigned short v; + unsigned short s[8]; + } t; + /* Find smallest metric and splat */ scale = vp->new_metrics->v[0]; - for(i=1;i<32;i++) - scale = vec_min(scale,vp->new_metrics->v[i]); + for (i = 1; i < 32; i++) { + scale = vec_min(scale, vp->new_metrics->v[i]); + } - scale = vec_min(scale,vec_sld(scale,scale,8)); - scale = vec_min(scale,vec_sld(scale,scale,4)); - scale = vec_min(scale,vec_sld(scale,scale,2)); + scale = vec_min(scale, vec_sld(scale, scale, 8)); + scale = vec_min(scale, vec_sld(scale, scale, 4)); + scale = vec_min(scale, vec_sld(scale, scale, 2)); /* Subtract it from all metrics * Work backwards to try to improve the cache hit ratio, assuming LRU */ - for(i=31;i>=0;i--) - vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + for (i = 31; i >= 0; i--) { + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i], scale); 
+ } t.v = scale; path_metric += t.s[0]; } diff --git a/lib/libfec/viterbi39_mmx.c b/lib/libfec/viterbi39_mmx.c new file mode 100644 index 0000000..834f0ba --- /dev/null +++ b/lib/libfec/viterbi39_mmx.c @@ -0,0 +1,215 @@ +/* K=9 r=1/3 Viterbi decoder for x86 MMX + * Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned char c[256]; + __m64 v[32]; +} decision_t; +typedef union { + unsigned short s[256]; + __m64 v[64]; +} metric_t; + +static union branchtab39 { + unsigned short s[128]; + __m64 v[32]; +} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_mmx(void *p, int starting_state) +{ + struct v39 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 256; i++) { + vp->metrics1.s[i] = 1000; + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_mmx(int polys[3]) +{ + int state; + + for (state = 0; state < 128; state++) { + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2 * state) & polys[0]) ? 255 + : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2 * state) & polys[1]) ? 255 + : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2 * state) & polys[2]) ? 
255 + : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_mmx(int len) +{ + struct v39 *vp; + + if (!Init) { + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + set_viterbi39_polynomial_mmx(polys); + } + if ((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) { + return NULL; + } + if ((vp->decisions = malloc((len + 8) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi39_mmx(vp, 0); + return vp; +} + + + +/* Viterbi chainback */ +int chainback_viterbi39_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if (p == NULL) { + return -1; + } + + d = (decision_t *)vp->decisions; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits >> 3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_mmx(void *p) +{ + struct v39 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_mmx(void *p, unsigned char *syms, int nbits) +{ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if (p == NULL) { + return -1; + } + + d = (decision_t *)vp->dp; + + while (nbits--) { + __m64 sym0v, sym1v, sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for (i = 0; i < 32; i++) { + __m64 decision0, decision1, metric, m_metric, 
m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. + * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i], sym0v), + _mm_xor_si64(Branchtab39[1].v[i], sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i], sym2v), m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i], metric); + m3 = _mm_add_pi16(vp->old_metrics->v[32 + i], metric); + m1 = _mm_add_pi16(vp->old_metrics->v[32 + i], m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i], m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0, m1), _mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2, m3), _mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0, m1), _mm_andnot_si64(decision0, + m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1, m3), _mm_andnot_si64(decision1, + m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0, _mm_setzero_si64()), + _mm_packs_pi16(decision1, _mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_pi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_pi16(survivor0, survivor1); + } + if (vp->new_metrics->s[0] < vp->old_metrics->s[0]) { + path_metric += 65536; /* Hack: wraparound probably occured */ + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_port.c 
b/lib/libfec/viterbi39_port.c similarity index 61% rename from libfec/viterbi39_port.c rename to lib/libfec/viterbi39_port.c index 5685c90..13cfa71 100644 --- a/libfec/viterbi39_port.c +++ b/lib/libfec/viterbi39_port.c @@ -7,10 +7,16 @@ #include #include "fec.h" -typedef union { unsigned int w[256]; } metric_t; -typedef union { unsigned long w[8];} decision_t; +typedef union { + unsigned int w[256]; +} metric_t; +typedef union { + unsigned long w[8]; +} decision_t; -static union { unsigned char c[128]; } Branchtab39[3]; +static union { + unsigned char c[128]; +} Branchtab39[3]; static int Init = 0; /* State info for instance of Viterbi decoder */ @@ -18,19 +24,23 @@ struct v39 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39_port(void *p,int starting_state){ +int init_viterbi39_port(void *p, int starting_state) +{ struct v39 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<256;i++) + } + for (i = 0; i < 256; i++) { vp->metrics1.w[i] = 63; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -39,33 +49,40 @@ int init_viterbi39_port(void *p,int starting_state){ return 0; } -void set_viterbi39_polynomial_port(int polys[3]){ +void set_viterbi39_polynomial_port(int polys[3]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab39[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab39[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; - Branchtab39[2].c[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 
255 : 0; + for (state = 0; state < 128; state++) { + Branchtab39[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs( + polys[0])) ? 255 : 0; + Branchtab39[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs( + polys[1])) ? 255 : 0; + Branchtab39[2].c[state] = (polys[2] < 0) ^ parity((2 * state) & abs( + polys[2])) ? 255 : 0; } Init++; } /* Create a new instance of a Viterbi decoder */ -void *create_viterbi39_port(int len){ +void *create_viterbi39_port(int len) +{ struct v39 *vp; - if(!Init){ - int polys[3] = {V39POLYA,V39POLYB,V39POLYC}; + if (!Init) { + int polys[3] = {V39POLYA, V39POLYB, V39POLYC}; set_viterbi39_polynomial_port(polys); } - if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + if ((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) { return NULL; + } - if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + if ((vp->decisions = (decision_t *)malloc((len + 8) * sizeof( + decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi39_port(vp,0); + init_viterbi39_port(vp, 0); return vp; } @@ -73,15 +90,17 @@ void *create_viterbi39_port(int len){ /* Viterbi chainback */ int chainback_viterbi39_port( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v39 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; + } d = vp->decisions; /* Make room beyond the end of the encoder register so we can @@ -94,21 +113,22 @@ int chainback_viterbi39_port( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - - k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + + k = (d[nbits].w[(endstate) / 32] 
>> (endstate % 32)) & 1; + data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi39_port(void *p){ +void delete_viterbi39_port(void *p) +{ struct v39 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } @@ -136,33 +156,37 @@ unsigned int metric,m0,m1,decision;\ * of symbols! */ -int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){ +int update_viterbi39_blk_port(void *p, unsigned char *syms, int nbits) +{ struct v39 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; - + } + d = (decision_t *)vp->dp; - while(nbits--){ + while (nbits--) { void *tmp; - unsigned char sym0,sym1,sym2; + unsigned char sym0, sym1, sym2; int i; - for(i=0;i<8;i++) + for (i = 0; i < 8; i++) { d->w[i] = 0; + } sym0 = *syms++; sym1 = *syms++; sym2 = *syms++; - for(i=0;i<128;i++) + for (i = 0; i < 128; i++) { BFLY(i); + } d++; tmp = vp->old_metrics; vp->old_metrics = vp->new_metrics; vp->new_metrics = tmp; - } + } vp->dp = d; return 0; } diff --git a/lib/libfec/viterbi39_sse.c b/lib/libfec/viterbi39_sse.c new file mode 100644 index 0000000..c1690d1 --- /dev/null +++ b/lib/libfec/viterbi39_sse.c @@ -0,0 +1,233 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned long w[8]; + unsigned char c[32]; +} decision_t; +typedef union { + signed short s[256]; + __m64 v[64]; +} metric_t; + +static union branchtab39 { + unsigned short s[128]; + __m64 v[32]; +} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to 
path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse(void *p, int starting_state) +{ + struct v39 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 256; i++) { + vp->metrics1.s[i] = (SHRT_MIN + 1000); + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = + SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse(int len) +{ + struct v39 *vp; + + if (!Init) { + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse(polys); + } + if ((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) { + return NULL; + } + if ((vp->decisions = malloc((len + 8) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi39_sse(vp, 0); + return vp; +} + +void set_viterbi39_polynomial_sse(int polys[3]) +{ + int state; + + for (state = 0; state < 128; state++) { + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2 * state) & polys[0]) ? 255 + : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2 * state) & polys[1]) ? 255 + : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2 * state) & polys[2]) ? 255 + : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->decisions; + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while (nbits-- != 0) { + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate / 8] >> (endstate % 8)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits >> 3] = endstate; + } + return path_metric - SHRT_MIN; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse(void *p) +{ + struct v39 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse(void *p, unsigned char *syms, int nbits) +{ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->dp; + while (nbits--) { + __m64 sym0v, sym1v, sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for (i = 0; i < 32; i++) { + __m64 decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i], sym0v), + _mm_xor_si64(Branchtab39[1].v[i], sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i], sym2v), m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i], metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[32 + i], metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[32 + i], m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i], m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0, m1); + survivor1 = _mm_min_pi16(m2, m3); + decision0 = _mm_cmpeq_pi16(survivor0, m1); + decision1 = _mm_cmpeq_pi16(survivor1, m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0, + _mm_setzero_si64()), _mm_packs_pi16(decision1, _mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_pi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_pi16(survivor0, survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if (vp->new_metrics->s[0] >= SHRT_MAX - 5000) { + int i, adjust; + __m64 adjustv; + union { + __m64 v; + signed short w[4]; + } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for (i = 1; i < 64; i++) { + adjustv = _mm_min_pi16(adjustv, vp->new_metrics->v[i]); + } + + adjustv = _mm_min_pi16(adjustv, _mm_srli_si64(adjustv, 32)); + adjustv = _mm_min_pi16(adjustv, _mm_srli_si64(adjustv, 16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_pi16(adjust); + + for (i = 0; i < 64; i++) { + vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i], adjustv); + } + } + d++; + /* Swap pointers to old and new metrics */ + tmp = 
vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_sse2.c b/lib/libfec/viterbi39_sse2.c similarity index 50% rename from libfec/viterbi39_sse2.c rename to lib/libfec/viterbi39_sse2.c index f13794e..97367aa 100644 --- a/libfec/viterbi39_sse2.c +++ b/lib/libfec/viterbi39_sse2.c @@ -9,10 +9,19 @@ #include #include "fec.h" -typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; -typedef union { signed short s[256]; __m128i v[32];} metric_t; +typedef union { + unsigned long w[8]; + unsigned short s[16]; +} decision_t; +typedef union { + signed short s[256]; + __m128i v[32]; +} metric_t; -static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; +static union branchtab39 { + unsigned short s[128]; + __m128i v[16]; +} Branchtab39[3]; static int Init = 0; /* State info for instance of Viterbi decoder */ @@ -20,66 +29,77 @@ struct v39 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ void *decisions; /* Beginning of decisions for block */ }; /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39_sse2(void *p,int starting_state){ +int init_viterbi39_sse2(void *p, int starting_state) +{ struct v39 *vp = p; int i; - for(i=0;i<256;i++) - vp->metrics1.s[i] = (SHRT_MIN+1000); + for (i = 0; i < 256; i++) { + vp->metrics1.s[i] = (SHRT_MIN + 1000); + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + vp->old_metrics->s[starting_state & 255] = + SHRT_MIN; /* Bias known start state */ return 0; } /* Create a new 
instance of a Viterbi decoder */ -void *create_viterbi39_sse2(int len){ +void *create_viterbi39_sse2(int len) +{ void *p; struct v39 *vp; - if(!Init){ + if (!Init) { int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; set_viterbi39_polynomial_sse2(polys); } /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ - if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39))) + if (posix_memalign(&p, sizeof(__m128i), sizeof(struct v39))) { return NULL; + } vp = (struct v39 *)p; - if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + if ((p = malloc((len + 8) * sizeof(decision_t))) == NULL) { free(vp); return NULL; } vp->decisions = (decision_t *)p; - init_viterbi39_sse2(vp,0); + init_viterbi39_sse2(vp, 0); return vp; } -void set_viterbi39_polynomial_sse2(int polys[3]){ +void set_viterbi39_polynomial_sse2(int polys[3]) +{ int state; - for(state=0;state < 128;state++){ - Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0; - Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0; - Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0; + for (state = 0; state < 128; state++) { + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2 * state) & polys[0]) ? 255 + : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2 * state) & polys[1]) ? 255 + : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2 * state) & polys[2]) ? 
255 + : 0; } Init++; } /* Viterbi chainback */ int chainback_viterbi39_sse2( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v39 *vp = p; decision_t *d = (decision_t *)vp->decisions; int path_metric; @@ -93,34 +113,36 @@ int chainback_viterbi39_sse2( * combine in the cache anyway */ d += 8; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; + k = (d[nbits].w[endstate / 32] >> (endstate % 32)) & 1; endstate = (k << 7) | (endstate >> 1); - data[nbits>>3] = endstate; + data[nbits >> 3] = endstate; } return path_metric; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi39_sse2(void *p){ +void delete_viterbi39_sse2(void *p) +{ struct v39 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } } -int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ +int update_viterbi39_blk_sse2(void *p, unsigned char *syms, int nbits) +{ struct v39 *vp = p; decision_t *d = (decision_t *)vp->dp; int path_metric = 0; - while(nbits--){ - __m128i sym0v,sym1v,sym2v; + while (nbits--) { + __m128i sym0v, sym1v, sym2v; void *tmp; int i; @@ -131,51 +153,58 @@ int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ syms += 3; /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */ - for(i=0;i<16;i++){ - __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + for (i = 0; i < 16; i++) { + __m128i decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; /* Form branch metrics * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, * the 
XOR operations constitute conditional negation. * metric and m_metric (-metric) are in the range 0-765 */ - m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v)); - metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0); - m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric); - + m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i], sym0v), + _mm_xor_si128(Branchtab39[1].v[i], sym1v)); + metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i], sym2v), m0); + m_metric = _mm_sub_epi16(_mm_set1_epi16(765), metric); + /* Add branch metrics to path metrics */ - m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric); - m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric); - m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric); - m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric); - + m0 = _mm_adds_epi16(vp->old_metrics->v[i], metric); + m3 = _mm_adds_epi16(vp->old_metrics->v[16 + i], metric); + m1 = _mm_adds_epi16(vp->old_metrics->v[16 + i], m_metric); + m2 = _mm_adds_epi16(vp->old_metrics->v[i], m_metric); + /* Compare and select */ - survivor0 = _mm_min_epi16(m0,m1); - survivor1 = _mm_min_epi16(m2,m3); - decision0 = _mm_cmpeq_epi16(survivor0,m1); - decision1 = _mm_cmpeq_epi16(survivor1,m3); - + survivor0 = _mm_min_epi16(m0, m1); + survivor1 = _mm_min_epi16(m2, m3); + decision0 = _mm_cmpeq_epi16(survivor0, m1); + decision1 = _mm_cmpeq_epi16(survivor1, m3); + /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ - d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128()))); + d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0, + _mm_setzero_si128()), _mm_packs_epi16(decision1, _mm_setzero_si128()))); /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = 
_mm_unpackhi_epi16(survivor0,survivor1); + vp->new_metrics->v[2 * i] = _mm_unpacklo_epi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_epi16(survivor0, survivor1); } /* See if we need to renormalize */ - if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ - int i,adjust; + if (vp->new_metrics->s[0] >= SHRT_MAX - 5000) { + int i, adjust; __m128i adjustv; - union { __m128i v; signed short w[8]; } t; - + union { + __m128i v; + signed short w[8]; + } t; + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ adjustv = vp->new_metrics->v[0]; - for(i=1;i<32;i++) - adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); + for (i = 1; i < 32; i++) { + adjustv = _mm_min_epi16(adjustv, vp->new_metrics->v[i]); + } - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 8)); + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 4)); + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 2)); t.v = adjustv; adjust = t.w[0] - SHRT_MIN; path_metric += adjust; @@ -184,8 +213,9 @@ int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX * This is okay since it can't overflow anyway */ - for(i=0;i<32;i++) - vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); + for (i = 0; i < 32; i++) { + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i], adjustv); + } } d++; /* Swap pointers to old and new metrics */ diff --git a/lib/libfec/viterbi615.c b/lib/libfec/viterbi615.c new file mode 100644 index 0000000..0df2f13 --- /dev/null +++ b/lib/libfec/viterbi615.c @@ -0,0 +1,187 @@ +/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include 
"fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615(int len) +{ + + find_cpu_mode(); + + switch (Cpu_mode) { + case PORT: + default: + return create_viterbi615_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi615_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi615_mmx(len); + case SSE: + return create_viterbi615_sse(len); + case SSE2: + return create_viterbi615_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi615_port(len); +#endif + } +} + +void set_viterbi615_polynomial(int polys[6]) +{ + + switch (Cpu_mode) { + case PORT: + default: + set_viterbi615_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi615_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi615_polynomial_mmx(polys); + break; + case SSE: + set_viterbi615_polynomial_sse(polys); + break; + case SSE2: + set_viterbi615_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi615_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615(void *p, int starting_state) +{ + switch (Cpu_mode) { + case PORT: + default: + return init_viterbi615_port(p, starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi615_av(p, starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi615_mmx(p, starting_state); + case SSE: + return init_viterbi615_sse(p, starting_state); + case SSE2: + return init_viterbi615_sse2(p, starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi615_port(p, starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi615( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + + switch (Cpu_mode) { + case PORT: + default: + return chainback_viterbi615_port(p, data, 
nbits, endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi615_av(p, data, nbits, endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi615_mmx(p, data, nbits, endstate); + case SSE: + return chainback_viterbi615_sse(p, data, nbits, endstate); + case SSE2: + return chainback_viterbi615_sse2(p, data, nbits, endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi615_port(p, data, nbits, endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615(void *p) +{ + switch (Cpu_mode) { + case PORT: + default: + delete_viterbi615_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi615_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi615_mmx(p); + break; + case SSE: + delete_viterbi615_sse(p); + break; + case SSE2: + delete_viterbi615_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi615_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi615_blk(void *p, unsigned char syms[], int nbits) +{ + switch (Cpu_mode) { + case PORT: + default: + return update_viterbi615_blk_port(p, syms, nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi615_blk_av(p, syms, nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi615_blk_mmx(p, syms, nbits); + case SSE: + return update_viterbi615_blk_sse(p, syms, nbits); + case SSE2: + return update_viterbi615_blk_sse2(p, syms, nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi615_blk_port(p, syms, nbits); +#endif + } +} + diff --git a/lib/libfec/viterbi615_av.c b/lib/libfec/viterbi615_av.c new file mode 100644 index 0000000..6275510 --- /dev/null +++ b/lib/libfec/viterbi615_av.c @@ -0,0 +1,295 @@ +/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned char c[128][16]; + vector unsigned char v[128]; +} decision_t; +typedef union { + unsigned short s[16384]; + vector unsigned short v[2048]; +} metric_t; + +static union branchtab615 { + unsigned short s[8192]; + vector unsigned short v[1024]; +} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_av(void *p, int starting_state) +{ + struct v615 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + + for (i = 0; i < 2048; i++) { + vp->metrics1.v[i] = (vector 
unsigned short)(5000); + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_av(int len) +{ + struct v615 *vp; + + if (!Init) { + int polys[6] = { V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF }; + set_viterbi615_polynomial_av(polys); + } + vp = (struct v615 *)malloc(sizeof(struct v615)); + vp->decisions = malloc(sizeof(decision_t) * (len + 14)); + init_viterbi615_av(vp, 0); + return vp; +} + +void set_viterbi615_polynomial_av(int polys[6]) +{ + int state; + int i; + + for (state = 0; state < 8192; state++) { + for (i = 0; i < 6; i++) { + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2 * state) & abs( + polys[i])) ? 255 : 0; + } + } + Init++; +} + + +/* Viterbi chainback */ +int chainback_viterbi615_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 16384; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> (( + endstate >> 4) & 7))) ? 
1 : 0; + endstate = (k << 13) | (endstate >> 1); + data[nbits >> 3] = endstate >> 6; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_av(void *p) +{ + struct v615 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + +int update_viterbi615_blk_av(void *p, unsigned char *syms, int nbits) +{ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while (nbits--) { + vector unsigned short symv, sym0v, sym1v, sym2v, sym3v, sym4v, sym5v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0, syms), vec_ld(5, syms), vec_lvsl(0, + syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0), + s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv, 0); + sym1v = vec_splat(symv, 1); + sym2v = vec_splat(symv, 2); + sym3v = vec_splat(symv, 3); + sym4v = vec_splat(symv, 4); + sym5v = vec_splat(symv, 5); + syms += 6; + + for (i = 0; i < 1024; i++) { + vector bool short decision0, decision1; + vector unsigned short metric, m_metric, m0, m1, m2, m3, survivor0, survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = vec_add(vec_xor(Branchtab615[0].v[i], sym0v), vec_xor(Branchtab615[1].v[i], + sym1v)); + m1 = vec_add(vec_xor(Branchtab615[2].v[i], sym2v), vec_xor(Branchtab615[3].v[i], + sym3v)); + m2 = vec_add(vec_xor(Branchtab615[4].v[i], sym4v), vec_xor(Branchtab615[5].v[i], + sym5v)); + metric = vec_add(m0, m1); + metric = vec_add(metric, m2); + m_metric = vec_sub((vector unsigned short)(1530), metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i], metric); + m3 = vec_adds(vp->old_metrics->v[1024 + i], metric); + m1 = vec_adds(vp->old_metrics->v[1024 + i], m_metric); + m2 = vec_adds(vp->old_metrics->v[i], m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0, m1); + decision1 = vec_cmpgt(m2, m3); + survivor0 = vec_min(m0, m1); + survivor1 = vec_min(m2, m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions, + decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions, + (vector unsigned char)vec_pack(vec_mergeh(decision0, decision1), + vec_mergel(decision0, decision1))); + + vp->new_metrics->v[2 * i] = vec_mergeh(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = vec_mergel(survivor0, survivor1); + + if ((i % 8) == 7) { + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i >> 3] = + decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { + vector unsigned short v; + unsigned short s[8]; + } t; + int minimum, maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for (i = 1; i < 2048; i++) { + min_metric = vec_min(min_metric, vp->new_metrics->v[i]); + max_metric = vec_max(max_metric, vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 8)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 8)); + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 4)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 4)); + min_metric = vec_min(min_metric, vec_sld(min_metric, min_metric, 2)); + max_metric = vec_max(max_metric, vec_sld(max_metric, max_metric, 2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if (maximum - minimum > max_spread) { + max_spread = maximum - minimum; + printf("metric spread = %d\n", max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + + * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750. + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if (vp->new_metrics->s[0] >= USHRT_MAX - 12750) { + vector unsigned short scale; + union { + vector unsigned short v; + unsigned short s[8]; + } t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for (i = 1; i < 2048; i++) { + scale = vec_min(scale, vp->new_metrics->v[i]); + } + + scale = vec_min(scale, vec_sld(scale, scale, 8)); + scale = vec_min(scale, vec_sld(scale, scale, 4)); + scale = vec_min(scale, vec_sld(scale, scale, 2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for (i = 2047; i >= 0; i--) { + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i], scale); + } + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/lib/libfec/viterbi615_mmx.c b/lib/libfec/viterbi615_mmx.c new file mode 100644 index 0000000..e88b291 --- /dev/null +++ b/lib/libfec/viterbi615_mmx.c @@ -0,0 +1,213 @@ +/* K=15 r=1/6 Viterbi decoder for x86 MMX + * Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned char c[16384]; + __m64 v[2048]; +} decision_t; +typedef union { + unsigned short s[16384]; + __m64 v[4096]; +} metric_t; + +static union branchtab615 { + unsigned short s[8192]; + __m64 v[2048]; +} Branchtab615[6]; +static int 
Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_mmx(void *p, int starting_state) +{ + struct v615 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 16384; i++) { + vp->metrics1.s[i] = 5000; + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_mmx(int len) +{ + struct v615 *vp; + + if (!Init) { + int polys[6] = { V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF }; + set_viterbi615_polynomial_mmx(polys); + } + + if ((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) { + return NULL; + } + if ((vp->decisions = malloc((len + 14) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi615_mmx(vp, 0); + return vp; +} + +void set_viterbi615_polynomial_mmx(int polys[6]) +{ + int state; + int i; + + for (state = 0; state < 8192; state++) { + for (i = 0; i < 6; i++) { + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2 * state) & abs( + polys[i])) ? 
255 : 0; + } + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v615 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + + d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits >> 3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_mmx(void *p) +{ + struct v615 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_mmx(void *p, unsigned char *syms, int nbits) +{ + struct v615 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + + d = (decision_t *)vp->dp; + + while (nbits--) { + __m64 sym0v, sym1v, sym2v, sym3v, sym4v, sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for (i = 0; i < 2048; i++) { + __m64 decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i], sym0v), + _mm_xor_si64(Branchtab615[1].v[i], sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i], sym2v), + _mm_xor_si64(Branchtab615[3].v[i], sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i], sym4v), + _mm_xor_si64(Branchtab615[5].v[i], sym5v)); + metric = _mm_add_pi16(m0, _mm_add_pi16(m1, m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i], metric); + m3 = _mm_add_pi16(vp->old_metrics->v[2048 + i], metric); + m1 = _mm_add_pi16(vp->old_metrics->v[2048 + i], m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i], m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0, m1), _mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2, m3), _mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0, m1), _mm_andnot_si64(decision0, + m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1, m3), _mm_andnot_si64(decision1, + m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0, _mm_setzero_si64()), + _mm_packs_pi16(decision1, _mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_pi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_pi16(survivor0, survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/libfec/viterbi615_port.c b/lib/libfec/viterbi615_port.c similarity index 60% rename from libfec/viterbi615_port.c rename to lib/libfec/viterbi615_port.c index 89bdd80..eb76d51 100644 
--- a/libfec/viterbi615_port.c +++ b/lib/libfec/viterbi615_port.c @@ -8,10 +8,17 @@ #include #include "fec.h" -typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; -typedef union { unsigned long w[16384]; } metric_t; +typedef union { + unsigned long w[512]; + unsigned char c[2048]; +} decision_t; +typedef union { + unsigned long w[16384]; +} metric_t; -static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16))); +static union branchtab615 { + unsigned long w[8192]; +} Branchtab615[6] __attribute__((aligned(16))); static int Init = 0; /* State info for instance of Viterbi decoder */ @@ -19,48 +26,57 @@ struct v615 { metric_t metrics1; /* path metric buffer 1 */ metric_t metrics2; /* path metric buffer 2 */ decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ }; /* Create a new instance of a Viterbi decoder */ -void *create_viterbi615_port(int len){ +void *create_viterbi615_port(int len) +{ struct v615 *vp; - if(!Init){ - int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + if (!Init) { + int polys[6] = { V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF }; set_viterbi615_polynomial_port(polys); } - if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + if ((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) { return NULL; - if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + } + if ((vp->decisions = malloc((len + 14) * sizeof(decision_t))) == NULL) { free(vp); return NULL; } - init_viterbi615(vp,0); + init_viterbi615(vp, 0); return vp; } -void set_viterbi615_polynomial_port(int polys[6]){ +void set_viterbi615_polynomial_port(int polys[6]) +{ int state; int i; - for(state=0;state < 
8192;state++){ - for(i=0;i<6;i++) - Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + for (state = 0; state < 8192; state++) { + for (i = 0; i < 6; i++) { + Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2 * state) & abs( + polys[i])) ? 255 : 0; + } } Init++; } /* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615_port(void *p,int starting_state){ +int init_viterbi615_port(void *p, int starting_state) +{ struct v615 *vp = p; int i; - if(p == NULL) + if (p == NULL) { return -1; - for(i=0;i<16384;i++) + } + for (i = 0; i < 16384; i++) { vp->metrics1.w[i] = 1000; + } vp->old_metrics = &vp->metrics1; vp->new_metrics = &vp->metrics2; @@ -71,16 +87,18 @@ int init_viterbi615_port(void *p,int starting_state){ /* Viterbi chainback */ int chainback_viterbi615_port( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ struct v615 *vp = p; decision_t *d; - if(p == NULL) + if (p == NULL) { return -1; - d = (decision_t *)vp->decisions; + } + d = (decision_t *)vp->decisions; endstate %= 16384; /* The store into data[] only needs to be done every 8 bits. 
@@ -88,21 +106,22 @@ int chainback_viterbi615_port( * combine in the cache anyway */ d += 14; /* Look past tail */ - while(nbits-- != 0){ + while (nbits-- != 0) { int k; - k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + k = (d[nbits].c[endstate / 8] >> (endstate % 8)) & 1; endstate = (k << 13) | (endstate >> 1); - data[nbits>>3] = endstate >> 6; + data[nbits >> 3] = endstate >> 6; } return 0; } /* Delete instance of a Viterbi decoder */ -void delete_viterbi615_port(void *p){ +void delete_viterbi615_port(void *p) +{ struct v615 *vp = p; - if(vp != NULL){ + if (vp != NULL) { free(vp->decisions); free(vp); } @@ -112,8 +131,8 @@ void delete_viterbi615_port(void *p){ #define BFLY(i) {\ unsigned long metric,m0,m1,m2,m3,decision0,decision1;\ metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\ - +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\ - +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\ + +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\ + +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\ m0 = vp->old_metrics->w[i] + metric;\ m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\ m2 = vp->old_metrics->w[i] + (1530-metric);\ @@ -129,19 +148,22 @@ unsigned long metric,m0,m1,m2,m3,decision0,decision1;\ * of symbols! 
*/ -int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){ +int update_viterbi615_blk_port(void *p, unsigned char *syms, int nbits) +{ struct v615 *vp = p; void *tmp; decision_t *d; int i; - if(p == NULL) + if (p == NULL) { return -1; + } d = (decision_t *)vp->dp; - while(nbits--){ - memset(d,0,sizeof(decision_t)); - for(i=0;i<8192;i++) + while (nbits--) { + memset(d, 0, sizeof(decision_t)); + for (i = 0; i < 8192; i++) { BFLY(i); + } syms += 6; d++; @@ -149,7 +171,7 @@ int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){ tmp = vp->old_metrics; vp->old_metrics = vp->new_metrics; vp->new_metrics = tmp; - } + } vp->dp = d; return 0; } diff --git a/lib/libfec/viterbi615_sse.c b/lib/libfec/viterbi615_sse.c new file mode 100644 index 0000000..48b5775 --- /dev/null +++ b/lib/libfec/viterbi615_sse.c @@ -0,0 +1,234 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned long w[512]; + unsigned char c[2048]; +} decision_t; +typedef union { + signed short s[16384]; + __m64 v[4096]; +} metric_t; + +static union branchtab615 { + unsigned short s[8192]; + __m64 v[2048]; +} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse(void *p, int starting_state) +{ + struct v615 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 16384; i++) { + vp->metrics1.s[i] = (SHRT_MIN + 5000); + } + + 
vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = + SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse(int len) +{ + struct v615 *vp; + + if (!Init) { + int polys[6] = { V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF }; + set_viterbi615_polynomial_sse(polys); + } + + if ((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) { + return NULL; + } + if ((vp->decisions = malloc((len + 14) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + init_viterbi615_sse(vp, 0); + return vp; +} + +void set_viterbi615_polynomial_sse(int polys[6]) +{ + int state; + int i; + + for (state = 0; state < 8192; state++) { + for (i = 0; i < 6; i++) { + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2 * state) & abs( + polys[i])) ? 255 : 0; + } + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v615 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while (nbits-- != 0) { + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate / 8] >> (endstate % 8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits >> 3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse(void *p) +{ + struct v615 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse(void *p, unsigned char *syms, int nbits) +{ + struct v615 *vp = p; + decision_t *d; + + if (p == NULL) { + return -1; + } + d = (decision_t *)vp->dp; + while (nbits--) { + __m64 sym0v, sym1v, sym2v, sym3v, sym4v, sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for (i = 0; i < 2048; i++) { + __m64 decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i], sym0v), + _mm_xor_si64(Branchtab615[1].v[i], sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i], sym2v), + _mm_xor_si64(Branchtab615[3].v[i], sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i], sym4v), + _mm_xor_si64(Branchtab615[5].v[i], sym5v)); + metric = _mm_add_pi16(m0, _mm_add_pi16(m1, m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i], metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[2048 + i], metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[2048 + i], m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i], m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0, m1); + survivor1 = _mm_min_pi16(m2, m3); + decision0 = _mm_cmpeq_pi16(survivor0, m1); + decision1 = _mm_cmpeq_pi16(survivor1, m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0, + _mm_setzero_si64()), _mm_packs_pi16(decision1, _mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_pi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_pi16(survivor0, survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if (vp->new_metrics->s[0] >= SHRT_MAX - 12750) { + int i, adjust; + __m64 adjustv; + union { + __m64 v; + signed short w[4]; + } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for (i = 1; i < 4096; i++) { + adjustv = _mm_min_pi16(adjustv, vp->new_metrics->v[i]); + } + + adjustv = _mm_min_pi16(adjustv, _mm_srli_si64(adjustv, 32)); + adjustv = _mm_min_pi16(adjustv, _mm_srli_si64(adjustv, 16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_pi16(adjust); + 
+ for (i = 0; i < 4096; i++) { + vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i], adjustv); + } + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/lib/libfec/viterbi615_sse2.c b/lib/libfec/viterbi615_sse2.c new file mode 100644 index 0000000..1438945 --- /dev/null +++ b/lib/libfec/viterbi615_sse2.c @@ -0,0 +1,236 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { + unsigned long w[512]; + unsigned short s[1024]; +} decision_t; +typedef union { + signed short s[16384]; + __m128i v[2048]; +} metric_t; + +static union branchtab615 { + unsigned short s[8192]; + __m128i v[1024]; +} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics, + *new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse2(void *p, int starting_state) +{ + struct v615 *vp = p; + int i; + + if (p == NULL) { + return -1; + } + for (i = 0; i < 16384; i++) { + vp->metrics1.s[i] = (SHRT_MIN + 5000); + } + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = + SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse2(int len) +{ + void *p; + struct v615 *vp; + + if (!Init) { + int polys[6] = { V615POLYA, 
V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF }; + set_viterbi615_polynomial_sse2(polys); + } + + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if (posix_memalign(&p, sizeof(__m128i), sizeof(struct v615))) { + return NULL; + } + + vp = (struct v615 *)p; + if ((p = malloc((len + 14) * sizeof(decision_t))) == NULL) { + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi615_sse2(vp, 0); + return vp; +} + +void set_viterbi615_polynomial_sse2(int polys[6]) +{ + int state; + int i; + + for (state = 0; state < 8192; state++) { + for (i = 0; i < 6; i++) { + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2 * state) & abs( + polys[i])) ? 255 : 0; + } + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate) /* Terminal encoder state */ +{ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while (nbits-- != 0) { + int k; + + k = (d[nbits].w[endstate / 32] >> (endstate % 32)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits >> 3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse2(void *p) +{ + struct v615 *vp = p; + + if (vp != NULL) { + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse2(void *p, unsigned char *syms, int nbits) +{ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + + while (nbits--) { + __m128i sym0v, sym1v, sym2v, sym3v, sym4v, sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi16(syms[0]); + sym1v = _mm_set1_epi16(syms[1]); + sym2v = _mm_set1_epi16(syms[2]); + sym3v = _mm_set1_epi16(syms[3]); + sym4v = _mm_set1_epi16(syms[4]); + sym5v = _mm_set1_epi16(syms[5]); + syms += 6; + + /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */ + for (i = 0; i < 1024; i++) { + __m128i decision0, decision1, metric, m_metric, m0, m1, m2, m3, survivor0, + survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i], sym0v), + _mm_xor_si128(Branchtab615[1].v[i], sym1v)); + m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i], sym2v), + _mm_xor_si128(Branchtab615[3].v[i], sym3v)); + m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i], sym4v), + _mm_xor_si128(Branchtab615[5].v[i], sym5v)); + metric = _mm_add_epi16(m0, _mm_add_epi16(m1, m2)); + m_metric = _mm_sub_epi16(_mm_set1_epi16(1530), metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_epi16(vp->old_metrics->v[i], metric); + m3 = _mm_adds_epi16(vp->old_metrics->v[1024 + i], metric); + m1 = _mm_adds_epi16(vp->old_metrics->v[1024 + i], m_metric); + m2 = _mm_adds_epi16(vp->old_metrics->v[i], m_metric); + + /* Compare and select */ + survivor0 = _mm_min_epi16(m0, m1); + survivor1 = _mm_min_epi16(m2, m3); + decision0 = _mm_cmpeq_epi16(survivor0, m1); + decision1 = _mm_cmpeq_epi16(survivor1, m3); + + /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ + d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0, + _mm_setzero_si128()), _mm_packs_epi16(decision1, _mm_setzero_si128()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2 * i] = _mm_unpacklo_epi16(survivor0, survivor1); + vp->new_metrics->v[2 * i + 1] = _mm_unpackhi_epi16(survivor0, survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-90 branch metrics is 405 + */ + if (vp->new_metrics->s[0] >= SHRT_MAX - 12750) { + int i, adjust; + __m128i adjustv; + union { + __m128i v; + signed short w[8]; + } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for (i = 1; i < 2048; i++) { + adjustv = _mm_min_epi16(adjustv, vp->new_metrics->v[i]); + } + + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 8)); + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 
4)); + adjustv = _mm_min_epi16(adjustv, _mm_srli_si128(adjustv, 2)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_epi16(adjust); + + /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for (i = 0; i < 2048; i++) { + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i], adjustv); + } + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} + + diff --git a/lib/libfec/vtest27.c b/lib/libfec/vtest27.c new file mode 100644 index 0000000..f10842d --- /dev/null +++ b/lib/libfec/vtest27.c @@ -0,0 +1,194 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length", 1, NULL, 'l'}, + {"frame-count", 1, NULL, 'n'}, + {"ebn0", 1, NULL, 'e'}, + {"gain", 1, NULL, 'g'}, + {"verbose", 0, NULL, 'v'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + {"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {NULL}, +}; +#endif + +#define RATE (1./2.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc, char *argv[]) +{ + int i, d, tr; + int sr = 0, trials = 10000, errcnt, framebits = 2048; + long long int tot_errs = 0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8 * 2 * (MAXBYTES + 6)]; + void *vp; + extern char *optarg; + struct rusage start, finish; + double extime; + double gain, esn0, ebn0; + time_t t; + int badframes = 0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while ((d = getopt_long(argc, argv, "l:n:te:g:vapmst", Options, NULL)) != EOF) { +#else + while ((d = getopt(argc, argv, "l:n:te:g:vapmst")) != EOF) { +#endif + switch (d) { + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if (framebits > 8 * MAXBYTES) { + fprintf(stderr, "Frame limited to %d bits\n", MAXBYTES * 8); + framebits = MAXBYTES * 8; + } + if ((vp = create_viterbi27(framebits)) == NULL) { + printf("create_viterbi27 failed\n"); + exit(1); + } + if (ebn0 != -100) { + esn0 = ebn0 + 10 * log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. + */ + gain = 1. / sqrt(0.5 / pow(10., esn0 / 10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + + for (tr = 0; tr < trials; tr++) { + /* Encode a frame of random data */ + for (i = 0; i < framebits + 6; i++) { + int bit = (i < framebits) ? 
(random() & 1) : 0; + + sr = (sr << 1) | bit; + bits[i / 8] = sr & 0xff; + symbols[2 * i + 0] = addnoise(parity(sr & V27POLYA), gain, Gain, 127.5, 255); + symbols[2 * i + 1] = addnoise(parity(sr & V27POLYB), gain, Gain, 127.5, 255); + } + /* Decode it and make sure we get the right answer */ + /* Initialize Viterbi decoder */ + init_viterbi27(vp, 0); + + /* Decode block */ + update_viterbi27_blk(vp, symbols, framebits + 6); + + /* Do Viterbi chainback */ + chainback_viterbi27(vp, data, framebits, 0); + errcnt = 0; + for (i = 0; i < framebits / 8; i++) { + int e = Bitcnt[xordata[i] = data[i] ^ bits[i]]; + errcnt += e; + tot_errs += e; + } + if (errcnt != 0) { + badframes++; + } + if (Verbose > 1 && errcnt != 0) { + printf("frame %d, %d errors: ", tr, errcnt); + for (i = 0; i < framebits / 8; i++) { + printf("%02x", xordata[i]); + } + printf("\n"); + } + if (Verbose) + printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r", + tot_errs, (long long)framebits * (tr + 1), + tot_errs / ((double)framebits * (tr + 1)), + badframes, tr + 1, (double)badframes / (tr + 1)); + fflush(stdout); + } + if (Verbose > 1) { + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + } + else if (Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs, (long long)framebits * trials, + tot_errs / ((double)framebits * trials), + badframes, tr + 1, (double)badframes / (tr + 1)); + else { + printf("\n"); + } + + } + else { + /* Do time trials */ + memset(symbols, 127, sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF, &start); + for (tr = 0; tr < trials; tr++) { + /* Initialize Viterbi decoder */ + init_viterbi27(vp, 0); + + /* Decode block */ + update_viterbi27_blk(vp, symbols, framebits); + + /* Do Viterbi chainback */ + chainback_viterbi27(vp, data, framebits, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - 
start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n", trials, + framebits, extime); + printf("decoder speed: %g bits/s\n", trials * framebits / extime); + } + exit(0); +} diff --git a/lib/libfec/vtest29.c b/lib/libfec/vtest29.c new file mode 100644 index 0000000..b6f9b62 --- /dev/null +++ b/lib/libfec/vtest29.c @@ -0,0 +1,195 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length", 1, NULL, 'l'}, + {"frame-count", 1, NULL, 'n'}, + {"ebn0", 1, NULL, 'e'}, + {"gain", 1, NULL, 'g'}, + {"verbose", 0, NULL, 'v'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + {"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {NULL}, +}; +#endif + +#define RATE (1./2.) +#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc, char *argv[]) +{ + int i, d, tr; + int sr = 0, trials = 10000, errcnt, framebits = 2048; + long long tot_errs = 0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8 * 2 * (MAXBYTES + 8)]; + void *vp; + extern char *optarg; + struct rusage start, finish; + double extime; + double gain, esn0, ebn0; + time_t t; + int badframes = 0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while ((d = getopt_long(argc, argv, "l:n:te:g:vapmst", Options, NULL)) != EOF) { +#else + while ((d = getopt(argc, argv, "l:n:te:g:vapmst")) != EOF) { +#endif + switch (d) { + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + 
case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if (framebits > 8 * MAXBYTES) { + fprintf(stderr, "Frame limited to %d bits\n", MAXBYTES * 8); + framebits = MAXBYTES * 8; + } + if ((vp = create_viterbi29(framebits)) == NULL) { + printf("create_viterbi29 failed\n"); + exit(1); + } + if (ebn0 != -100) { + esn0 = ebn0 + 10 * log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. + */ + gain = 1. / sqrt(0.5 / pow(10., esn0 / 10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + + for (tr = 0; tr < trials; tr++) { + /* Encode a frame of random data */ + for (i = 0; i < framebits + 8; i++) { + int bit = (i < framebits) ? (random() & 1) : 0; + + sr = (sr << 1) | bit; + bits[i / 8] = sr & 0xff; + symbols[2 * i + 0] = addnoise(parity(sr & V29POLYA), gain, Gain, 127.5, 255); + symbols[2 * i + 1] = addnoise(parity(sr & V29POLYB), gain, Gain, 127.5, 255); + } + /* Decode it and make sure we get the right answer */ + /* Initialize Viterbi decoder */ + init_viterbi29(vp, 0); + + /* Decode block */ + update_viterbi29_blk(vp, symbols, framebits + 8); + + /* Do Viterbi chainback */ + chainback_viterbi29(vp, data, framebits, 0); + errcnt = 0; + for (i = 0; i < framebits / 8; i++) { + int e = Bitcnt[xordata[i] = data[i] ^ bits[i]]; + errcnt += e; + tot_errs += e; + } + if (errcnt != 0) { + badframes++; + } + if (Verbose > 1 && errcnt != 0) { + printf("frame %d, %d errors: ", tr, errcnt); + for (i = 0; i < framebits / 8; i++) { + printf("%02x", xordata[i]); + } + printf("\n"); + } + if (Verbose) + printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r", + tot_errs, (long long)framebits * (tr + 1), + tot_errs / ((double)framebits * (tr + 1)), + badframes, tr + 1, (double)badframes / (tr + 1)); + fflush(stdout); + } + if 
(Verbose > 1) { + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + } + else if (Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs, (long long)framebits * trials, + tot_errs / ((double)framebits * trials), + badframes, tr + 1, (double)badframes / (tr + 1)); + else { + printf("\n"); + } + } + else { + /* Do time trials */ + memset(symbols, 127, sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF, &start); + for (tr = 0; tr < trials; tr++) { + /* Initialize Viterbi decoder */ + init_viterbi29(vp, 0); + + /* Decode block */ + update_viterbi29_blk(vp, symbols, framebits); + + /* Do Viterbi chainback */ + chainback_viterbi29(vp, data, framebits, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n", trials, + framebits, extime); + printf("decoder speed: %g bits/s\n", trials * framebits / extime); + } + exit(0); +} + + diff --git a/lib/libfec/vtest39.c b/lib/libfec/vtest39.c new file mode 100644 index 0000000..34c8330 --- /dev/null +++ b/lib/libfec/vtest39.c @@ -0,0 +1,196 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length", 1, NULL, 'l'}, + {"frame-count", 1, NULL, 'n'}, + {"ebn0", 1, NULL, 'e'}, + {"gain", 1, NULL, 'g'}, + {"verbose", 0, NULL, 'v'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + {"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {NULL}, +}; +#endif + +#define RATE (1./3.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc, char *argv[]) +{ + int i, d, tr; + int sr = 0, trials = 10000, errcnt, framebits = 2048; + long long tot_errs = 0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8 * 3 * (MAXBYTES + 8)]; + void *vp; + extern char *optarg; + struct rusage start, finish; + double extime; + double gain, esn0, ebn0; + time_t t; + int badframes = 0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while ((d = getopt_long(argc, argv, "l:n:te:g:vapmst", Options, NULL)) != EOF) { +#else + while ((d = getopt(argc, argv, "l:n:te:g:vapmst")) != EOF) { +#endif + switch (d) { + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if (framebits > 8 * MAXBYTES) { + fprintf(stderr, "Frame limited to %d bits\n", MAXBYTES * 8); + framebits = MAXBYTES * 8; + } + if ((vp = create_viterbi39(framebits)) == NULL) { + printf("create_viterbi39 failed\n"); + exit(1); + } + if (ebn0 != -100) { + esn0 = ebn0 + 10 * log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. + */ + gain = 1. / sqrt(0.5 / pow(10., esn0 / 10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + + for (tr = 0; tr < trials; tr++) { + /* Encode a frame of random data */ + for (i = 0; i < framebits + 8; i++) { + int bit = (i < framebits) ? 
(random() & 1) : 0; + + sr = (sr << 1) | bit; + bits[i / 8] = sr & 0xff; + symbols[3 * i + 0] = addnoise(parity(sr & V39POLYA), gain, Gain, 127.5, 255); + symbols[3 * i + 1] = addnoise(parity(sr & V39POLYB), gain, Gain, 127.5, 255); + symbols[3 * i + 2] = addnoise(parity(sr & V39POLYC), gain, Gain, 127.5, 255); + } + /* Decode it and make sure we get the right answer */ + /* Initialize Viterbi decoder */ + init_viterbi39(vp, 0); + + /* Decode block */ + update_viterbi39_blk(vp, symbols, framebits + 8); + + /* Do Viterbi chainback */ + chainback_viterbi39(vp, data, framebits, 0); + errcnt = 0; + for (i = 0; i < framebits / 8; i++) { + int e = Bitcnt[xordata[i] = data[i] ^ bits[i]]; + errcnt += e; + tot_errs += e; + } + if (errcnt != 0) { + badframes++; + } + if (Verbose > 1 && errcnt != 0) { + printf("frame %d, %d errors: ", tr, errcnt); + for (i = 0; i < framebits / 8; i++) { + printf("%02x", xordata[i]); + } + printf("\n"); + } + if (Verbose) + printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r", + tot_errs, (long long)framebits * (tr + 1), + tot_errs / ((double)framebits * (tr + 1)), + badframes, tr + 1, (double)badframes / (tr + 1)); + fflush(stdout); + } + if (Verbose > 1) { + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + } + else if (Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs, (long long)framebits * trials, + tot_errs / ((double)framebits * trials), + badframes, tr + 1, (double)badframes / (tr + 1)); + else { + printf("\n"); + } + } + else { + /* Do time trials */ + memset(symbols, 127, sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF, &start); + for (tr = 0; tr < trials; tr++) { + /* Initialize Viterbi decoder */ + init_viterbi39(vp, 0); + + /* Decode block */ + update_viterbi39_blk(vp, symbols, framebits); + + /* Do Viterbi chainback */ + chainback_viterbi39(vp, data, framebits, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = 
finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n", trials, + framebits, extime); + printf("decoder speed: %g bits/s\n", trials * framebits / extime); + } + exit(0); +} + + diff --git a/lib/libfec/vtest615.c b/lib/libfec/vtest615.c new file mode 100644 index 0000000..2e76287 --- /dev/null +++ b/lib/libfec/vtest615.c @@ -0,0 +1,199 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length", 1, NULL, 'l'}, + {"frame-count", 1, NULL, 'n'}, + {"ebn0", 1, NULL, 'e'}, + {"gain", 1, NULL, 'g'}, + {"verbose", 0, NULL, 'v'}, + {"force-altivec", 0, NULL, 'a'}, + {"force-port", 0, NULL, 'p'}, + {"force-mmx", 0, NULL, 'm'}, + {"force-sse", 0, NULL, 's'}, + {"force-sse2", 0, NULL, 't'}, + {NULL}, +}; +#endif + +#define RATE (1./6.) 
+#define MAXBYTES 10000 +#define OFFSET (127.5) +#define CLIP 255 + +double Gain = 24.0; +int Verbose = 0; + +int main(int argc, char *argv[]) +{ + int i, d, tr; + int sr = 0, trials = 10, errcnt, framebits = 2048; + int tot_errs = 0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8 * 6 * (MAXBYTES + 14)]; + void *vp; + extern char *optarg; + struct rusage start, finish; + double extime; + double gain, esn0, ebn0; + time_t t; + int badframes = 0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while ((d = getopt_long(argc, argv, "l:n:te:g:vapmst", Options, NULL)) != EOF) { +#else + while ((d = getopt(argc, argv, "l:n:te:g:vapmst")) != EOF) { +#endif + switch (d) { + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if (framebits > 8 * MAXBYTES) { + fprintf(stderr, "Frame limited to %d bits\n", MAXBYTES * 8); + framebits = MAXBYTES * 8; + } + if ((vp = create_viterbi615(framebits)) == NULL) { + printf("create_viterbi615 failed\n"); + exit(1); + } + if (ebn0 != -100) { + esn0 = ebn0 + 10 * log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. + */ + gain = 1. / sqrt(0.5 / pow(10., esn0 / 10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + + for (tr = 0; tr < trials; tr++) { + /* Encode a frame of random data */ + for (i = 0; i < framebits + 14; i++) { + int bit = (i < framebits) ? 
(random() & 1) : 0; + + sr = (sr << 1) | bit; + bits[i / 8] = sr & 0xff; + symbols[6 * i + 0] = addnoise(parity(sr & V615POLYA), gain, Gain, OFFSET, CLIP); + symbols[6 * i + 1] = addnoise(parity(sr & V615POLYB), gain, Gain, OFFSET, CLIP); + symbols[6 * i + 2] = addnoise(parity(sr & V615POLYC), gain, Gain, OFFSET, CLIP); + symbols[6 * i + 3] = addnoise(parity(sr & V615POLYD), gain, Gain, OFFSET, CLIP); + symbols[6 * i + 4] = addnoise(parity(sr & V615POLYE), gain, Gain, OFFSET, CLIP); + symbols[6 * i + 5] = addnoise(parity(sr & V615POLYF), gain, Gain, OFFSET, CLIP); + } + /* Decode it and make sure we get the right answer */ + /* Initialize Viterbi decoder */ + init_viterbi615(vp, 0); + + /* Decode block */ + update_viterbi615_blk(vp, symbols, framebits + 14); + + /* Do Viterbi chainback */ + chainback_viterbi615(vp, data, framebits, 0); + errcnt = 0; + for (i = 0; i < framebits / 8; i++) { + int e = Bitcnt[xordata[i] = data[i] ^ bits[i]]; + errcnt += e; + tot_errs += e; + } + if (errcnt != 0) { + badframes++; + } + if (Verbose > 1 && errcnt != 0) { + printf("frame %d, %d errors: ", tr, errcnt); + for (i = 0; i < framebits / 8; i++) { + printf("%02x", xordata[i]); + } + printf("\n"); + } + if (Verbose) + printf("BER %d/%d (%10.3g) FER %d/%d (%10.3g)\r", + tot_errs, framebits * (tr + 1), tot_errs / ((double)framebits * (tr + 1)), + badframes, (tr + 1), (double)badframes / (tr + 1)); + fflush(stdout); + + } + + if (Verbose > 1) { + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n", trials, + framebits, ebn0, Gain); + } + else if (Verbose == 0) + printf("BER %d/%d (%.3g) FER %d/%d (%.3g)\n", + tot_errs, framebits * (tr + 1), tot_errs / ((double)framebits * (tr + 1)), + badframes, (tr + 1), (double)badframes / (tr + 1)); + else { + printf("\n"); + } + } + else { + /* Do time trials */ + memset(symbols, 127, sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF, &start); + for (tr = 0; tr < trials; tr++) { + /* Initialize Viterbi 
decoder */ + init_viterbi615(vp, 0); + + /* Decode block */ + update_viterbi615_blk(vp, symbols, framebits + 14); + + /* Do Viterbi chainback */ + chainback_viterbi615(vp, data, framebits, 0); + } + getrusage(RUSAGE_SELF, &finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6 * + (finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n", trials, + framebits, extime); + printf("decoder speed: %g bits/s\n", trials * framebits / extime); + } + exit(0); +} diff --git a/lib/lrpt_decoder_impl.cc b/lib/lrpt_decoder_impl.cc index 0d1cace..8afdcb8 100644 --- a/lib/lrpt_decoder_impl.cc +++ b/lib/lrpt_decoder_impl.cc @@ -26,11 +26,7 @@ #include "lrpt_decoder_impl.h" #include #include - - -extern "C" { -#include -} +#include namespace gr { namespace satnogs { diff --git a/libfec/cpu_mode_ppc.c b/libfec/cpu_mode_ppc.c deleted file mode 100644 index 0071558..0000000 --- a/libfec/cpu_mode_ppc.c +++ /dev/null @@ -1,40 +0,0 @@ -/* Determine CPU support for SIMD on Power PC - * Copyright 2004 Phil Karn, KA9Q - */ -#include -#include "fec.h" -#ifdef __VEC__ -#include -#endif - -/* Various SIMD instruction set names */ -char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", - "x86 Streaming SIMD Extensions (SSE)", - "x86 Streaming SIMD Extensions 2 (SSE2)", - "PowerPC G4/G5 Altivec/Velocity Engine"}; - -enum cpu_mode Cpu_mode; - -void find_cpu_mode(void){ - - if(Cpu_mode != UNKNOWN) - return; - -#ifdef __VEC__ - { - /* Ask the OS if we have Altivec support */ - int selectors[2] = { CTL_HW, HW_VECTORUNIT }; - int hasVectorUnit = 0; - size_t length = sizeof(hasVectorUnit); - int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); - if(0 == error && hasVectorUnit) - Cpu_mode = ALTIVEC; - else - Cpu_mode = PORT; - } -#else - Cpu_mode = PORT; -#endif - - fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]); -} diff --git a/libfec/cpu_mode_x86.c b/libfec/cpu_mode_x86.c 
deleted file mode 100644 index 322018e..0000000 --- a/libfec/cpu_mode_x86.c +++ /dev/null @@ -1,33 +0,0 @@ -/* Determine CPU support for SIMD - * Copyright 2004 Phil Karn, KA9Q - */ -#include -#include "fec.h" - -/* Various SIMD instruction set names */ -char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", - "x86 Streaming SIMD Extensions (SSE)", - "x86 Streaming SIMD Extensions 2 (SSE2)", - "PowerPC G4/G5 Altivec/Velocity Engine"}; - -enum cpu_mode Cpu_mode; - -void find_cpu_mode(void){ - - int f; - if(Cpu_mode != UNKNOWN) - return; - - /* Figure out what kind of CPU we have */ - f = cpu_features(); - if(f & (1<<26)){ /* SSE2 is present */ - Cpu_mode = SSE2; - } else if(f & (1<<25)){ /* SSE is present */ - Cpu_mode = SSE; - } else if(f & (1<<23)){ /* MMX is present */ - Cpu_mode = MMX; - } else { /* No SIMD at all */ - Cpu_mode = PORT; - } - fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]); -} diff --git a/libfec/cpu_mode_x86_64.c b/libfec/cpu_mode_x86_64.c deleted file mode 100644 index 758096a..0000000 --- a/libfec/cpu_mode_x86_64.c +++ /dev/null @@ -1,27 +0,0 @@ -/* Determine CPU support for SIMD - * Copyright 2004 Phil Karn, KA9Q - * - * Modified in 2012 by Matthias P. Braendli, HB9EGM - */ -#include -#include "fec.h" - -/* Various SIMD instruction set names */ -char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", - "x86 Streaming SIMD Extensions (SSE)", - "x86 Streaming SIMD Extensions 2 (SSE2)", - "PowerPC G4/G5 Altivec/Velocity Engine"}; - -enum cpu_mode Cpu_mode; - -void find_cpu_mode(void){ - - int f; - if(Cpu_mode != UNKNOWN) - return; - - /* According to the wikipedia entry x86-64, all x86-64 processors have SSE2 */ - /* The same assumption is also in other source files ! 
*/ - Cpu_mode = SSE2; - fprintf(stderr,"CPU: x86-64, using portable C implementation\n"); -} diff --git a/libfec/encode_rs_av.c b/libfec/encode_rs_av.c deleted file mode 100644 index 32e528f..0000000 --- a/libfec/encode_rs_av.c +++ /dev/null @@ -1,61 +0,0 @@ -/* Fast Reed-Solomon encoder for (255,223) CCSDS code on PowerPC G4/G5 using Altivec instructions - * Copyright 2004, Phil Karn KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include "fixed.h" - -/* Lookup table for feedback multiplications - * These are the low half of the coefficients. Since the generator polynomial is - * palindromic, we form it by reversing these on the fly - */ -static union { vector unsigned char v; unsigned char c[16]; } table[256]; - -static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); -static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); - -extern data_t CCSDS_alpha_to[]; -extern data_t CCSDS_index_of[]; -extern data_t CCSDS_poly[]; - -void rs_init_av(){ - int i,j; - - /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */ - for(j=0;j<16;j++){ - table[0].c[j] = 0; - for(i=1;i<256;i++){ - table[i].c[16-j-1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j+1] + CCSDS_index_of[i])]; - } - } -#if 0 - for(i=0;i<256;i++){ - printf("table[%3d] = %3vu\n",i,table[i].v); - } -#endif -} - -void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ - union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; - int i; - - shift_register.v[0] = (vector unsigned char)(0); - shift_register.v[1] = (vector unsigned char)(0); - - for(i=0;i -#include -#include - -#ifdef FIXED -#include "fixed.h" -#define EXERCISE exercise_8 -#elif defined(CCSDS) -#include "fixed.h" -#include "ccsds.h" -#define EXERCISE exercise_ccsds -#elif defined(BIGSYM) -#include 
"int.h" -#define EXERCISE exercise_int -#else -#include "char.h" -#define EXERCISE exercise_char -#endif - -#ifdef FIXED -#define PRINTPARM printf("(255,223):"); -#elif defined(CCSDS) -#define PRINTPARM printf("CCSDS (255,223):"); -#else -#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots); -#endif - -/* Exercise the RS codec passed as an argument */ -int EXERCISE( -#if !defined(CCSDS) && !defined(FIXED) -void *p, -#endif -int trials){ -#if !defined(CCSDS) && !defined(FIXED) - struct rs *rs = (struct rs *)p; -#endif - data_t block[NN],tblock[NN]; - int i; - int errors; - int errlocs[NN]; - int derrlocs[NROOTS]; - int derrors; - int errval,errloc; - int erasures; - int decoder_errors = 0; - - while(trials-- != 0){ - /* Test up to the error correction capacity of the code */ - for(errors=0;errors <= NROOTS/2;errors++){ - - /* Load block with random data and encode */ - for(i=0;i -#include "fec.h" - -unsigned char Partab[256]; -int P_init; - -/* Create 256-entry odd-parity lookup table - * Needed only on non-ia32 machines - */ -void partab_init(void){ - int i,cnt,ti; - - /* Initialize parity lookup table */ - for(i=0;i<256;i++){ - cnt = 0; - ti = i; - while(ti){ - if(ti & 1) - cnt++; - ti >>= 1; - } - Partab[i] = cnt & 1; - } - P_init=1; -} - -/* Lookup table giving count of 1 bits for integers 0-255 */ -int Bitcnt[] = { - 0, 1, 1, 2, 1, 2, 2, 3, - 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 
- 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, - 5, 6, 6, 7, 6, 7, 7, 8, -}; - diff --git a/libfec/fec.h b/libfec/fec.h deleted file mode 100644 index d6d4b08..0000000 --- a/libfec/fec.h +++ /dev/null @@ -1,355 +0,0 @@ -/* User include file for libfec - * Copyright 2004, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ - -#ifndef _FEC_H_ -#define _FEC_H_ - -/* r=1/2 k=7 convolutional encoder polynomials - * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB - * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted - */ -#define V27POLYA 0x6d -#define V27POLYB 0x4f - -void *create_viterbi27(int len); -void set_viterbi27_polynomial(int polys[2]); -int init_viterbi27(void *vp,int starting_state); -int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs); -int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27(void *vp); - -#ifdef __VEC__ -void *create_viterbi27_av(int len); -void set_viterbi27_polynomial_av(int polys[2]); -int init_viterbi27_av(void *p,int starting_state); -int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27_av(void *p); -int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits); -#endif - -#ifdef __i386__ -void *create_viterbi27_mmx(int len); -void set_viterbi27_polynomial_mmx(int polys[2]); -int init_viterbi27_mmx(void *p,int starting_state); -int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27_mmx(void *p); -int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits); - -void *create_viterbi27_sse(int len); -void set_viterbi27_polynomial_sse(int polys[2]); -int init_viterbi27_sse(void *p,int starting_state); -int 
chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27_sse(void *p); -int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits); - -void *create_viterbi27_sse2(int len); -void set_viterbi27_polynomial_sse2(int polys[2]); -int init_viterbi27_sse2(void *p,int starting_state); -int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27_sse2(void *p); -int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits); -#endif - -void *create_viterbi27_port(int len); -void set_viterbi27_polynomial_port(int polys[2]); -int init_viterbi27_port(void *p,int starting_state); -int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi27_port(void *p); -int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits); - -/* r=1/2 k=9 convolutional encoder polynomials */ -#define V29POLYA 0x1af -#define V29POLYB 0x11d - -void *create_viterbi29(int len); -void set_viterbi29_polynomial(int polys[2]); -int init_viterbi29(void *vp,int starting_state); -int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); -int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi29(void *vp); - -#ifdef __VEC__ -void *create_viterbi29_av(int len); -void set_viterbi29_polynomial_av(int polys[2]); -int init_viterbi29_av(void *p,int starting_state); -int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi29_av(void *p); -int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits); -#endif - -#ifdef __i386__ -void *create_viterbi29_mmx(int len); -void set_viterbi29_polynomial_mmx(int polys[2]); -int init_viterbi29_mmx(void *p,int starting_state); -int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void 
delete_viterbi29_mmx(void *p); -int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits); - -void *create_viterbi29_sse(int len); -void set_viterbi29_polynomial_sse(int polys[2]); -int init_viterbi29_sse(void *p,int starting_state); -int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi29_sse(void *p); -int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits); - -void *create_viterbi29_sse2(int len); -void set_viterbi29_polynomial_sse2(int polys[2]); -int init_viterbi29_sse2(void *p,int starting_state); -int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi29_sse2(void *p); -int update_viterbi29_blk_sse2(void *p,unsigned char *syms,int nbits); -#endif - -void *create_viterbi29_port(int len); -void set_viterbi29_polynomial_port(int polys[2]); -int init_viterbi29_port(void *p,int starting_state); -int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi29_port(void *p); -int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits); - -/* r=1/3 k=9 convolutional encoder polynomials */ -#define V39POLYA 0x1ed -#define V39POLYB 0x19b -#define V39POLYC 0x127 - -void *create_viterbi39(int len); -void set_viterbi39_polynomial(int polys[3]); -int init_viterbi39(void *vp,int starting_state); -int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); -int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39(void *vp); - -#ifdef __VEC__ -void *create_viterbi39_av(int len); -void set_viterbi39_polynomial_av(int polys[3]); -int init_viterbi39_av(void *p,int starting_state); -int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39_av(void *p); -int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits); -#endif - 
-#ifdef __i386__ -void *create_viterbi39_mmx(int len); -void set_viterbi39_polynomial_mmx(int polys[3]); -int init_viterbi39_mmx(void *p,int starting_state); -int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39_mmx(void *p); -int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits); - -void *create_viterbi39_sse(int len); -void set_viterbi39_polynomial_sse(int polys[3]); -int init_viterbi39_sse(void *p,int starting_state); -int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39_sse(void *p); -int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits); - -void *create_viterbi39_sse2(int len); -void set_viterbi39_polynomial_sse2(int polys[3]); -int init_viterbi39_sse2(void *p,int starting_state); -int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39_sse2(void *p); -int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits); -#endif - -void *create_viterbi39_port(int len); -void set_viterbi39_polynomial_port(int polys[3]); -int init_viterbi39_port(void *p,int starting_state); -int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi39_port(void *p); -int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits); - - -/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion - * dfree = 56 - * These bits may be left-right flipped from some textbook representations; - * here I have the bits entering the shift register from the right (low) end - * - * Some other spacecraft use the same code, but with the polynomials in a different order. - * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion, - * so use set_viterbi615_polynomial() as appropriate. 
- */ -#define V615POLYA 042631 -#define V615POLYB 047245 -#define V615POLYC 056507 -#define V615POLYD 073363 -#define V615POLYE 077267 -#define V615POLYF 064537 - -void *create_viterbi615(int len); -void set_viterbi615_polynomial(int polys[6]); -int init_viterbi615(void *vp,int starting_state); -int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits); -int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615(void *vp); - -#ifdef __VEC__ -void *create_viterbi615_av(int len); -void set_viterbi615_polynomial_av(int polys[6]); -int init_viterbi615_av(void *p,int starting_state); -int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615_av(void *p); -int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits); -#endif - -#ifdef __i386__ -void *create_viterbi615_mmx(int len); -void set_viterbi615_polynomial_mmx(int polys[6]); -int init_viterbi615_mmx(void *p,int starting_state); -int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615_mmx(void *p); -int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits); - -void *create_viterbi615_sse(int len); -void set_viterbi615_polynomial_sse(int polys[6]); -int init_viterbi615_sse(void *p,int starting_state); -int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615_sse(void *p); -int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits); - -void *create_viterbi615_sse2(int len); -void set_viterbi615_polynomial_sse2(int polys[6]); -int init_viterbi615_sse2(void *p,int starting_state); -int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615_sse2(void *p); -int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits); -#endif - -void 
*create_viterbi615_port(int len); -void set_viterbi615_polynomial_port(int polys[6]); -int init_viterbi615_port(void *p,int starting_state); -int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); -void delete_viterbi615_port(void *p); -int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits); - - -/* General purpose RS codec, 8-bit symbols */ -void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity); -int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, - int no_eras); -void *init_rs_char(int symsize,int gfpoly, - int fcr,int prim,int nroots, - int pad); -void free_rs_char(void *rs); - -/* General purpose RS codec, integer symbols */ -void encode_rs_int(void *rs,int *data,int *parity); -int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); -void *init_rs_int(int symsize,int gfpoly,int fcr, - int prim,int nroots,int pad); -void free_rs_int(void *rs); - -/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis) - * symbol representation - */ -void encode_rs_8(unsigned char *data,unsigned char *parity,int pad); -int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad); - -/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */ -void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad); -int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad); - -/* Tables to map from conventional->dual (Taltab) and - * dual->conventional (Tal1tab) bases - */ -extern unsigned char Taltab[],Tal1tab[]; - - -/* CPU SIMD instruction set available */ -extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode; -void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */ - -/* Determine parity of argument: 1 = odd, 0 = even */ -#if defined(__i386__) || defined(__x86_64__) -static inline int parityb(unsigned char x){ - __asm__ __volatile__ ("test %1,%1;setpo %0" : "=q"(x) : "q" (x)); - return 
x; -} -#else -void partab_init(); - -static inline int parityb(unsigned char x){ - extern unsigned char Partab[256]; - extern int P_init; - if(!P_init){ - partab_init(); - } - return Partab[x]; -} -#endif - - -static inline int parity(int x){ - /* Fold down to one byte */ - x ^= (x >> 16); - x ^= (x >> 8); - return parityb(x); -} - -/* Useful utilities for simulation */ -double normal_rand(double mean, double std_dev); -unsigned char addnoise(int sym,double amp,double gain,double offset,int clip); - -extern int Bitcnt[]; - -/* Dot product functions */ -void *initdp(signed short coeffs[],int len); -void freedp(void *dp); -long dotprod(void *dp,signed short a[]); - -void *initdp_port(signed short coeffs[],int len); -void freedp_port(void *dp); -long dotprod_port(void *dp,signed short a[]); - -#ifdef __i386__ -void *initdp_mmx(signed short coeffs[],int len); -void freedp_mmx(void *dp); -long dotprod_mmx(void *dp,signed short a[]); - -void *initdp_sse(signed short coeffs[],int len); -void freedp_sse(void *dp); -long dotprod_sse(void *dp,signed short a[]); - -void *initdp_sse2(signed short coeffs[],int len); -void freedp_sse2(void *dp); -long dotprod_sse2(void *dp,signed short a[]); -#endif - -#ifdef __x86_64__ -void *initdp_sse2(signed short coeffs[],int len); -void freedp_sse2(void *dp); -long dotprod_sse2(void *dp,signed short a[]); -#endif - -#ifdef __VEC__ -void *initdp_av(signed short coeffs[],int len); -void freedp_av(void *dp); -long dotprod_av(void *dp,signed short a[]); -#endif - -/* Sum of squares - accepts signed shorts, produces unsigned long long */ -unsigned long long sumsq(signed short *in,int cnt); -unsigned long long sumsq_port(signed short *in,int cnt); - -#ifdef __i386__ -unsigned long long sumsq_mmx(signed short *in,int cnt); -unsigned long long sumsq_sse(signed short *in,int cnt); -unsigned long long sumsq_sse2(signed short *in,int cnt); -#endif -#ifdef __x86_64__ -unsigned long long sumsq_sse2(signed short *in,int cnt); -#endif -#ifdef __VEC__ 
-unsigned long long sumsq_av(signed short *in,int cnt); -#endif - - -/* Low-level data structures and routines */ - -int cpu_features(void); - -#endif /* _FEC_H_ */ - - - diff --git a/libfec/peakval.c b/libfec/peakval.c deleted file mode 100644 index 2105a44..0000000 --- a/libfec/peakval.c +++ /dev/null @@ -1,50 +0,0 @@ -/* Switch to appropriate version of peakval routine - * Copyright 2004, Phil Karn, KA9Q - */ - -#include -#include "fec.h" - -int peakval_port(signed short *b,int cnt); -#ifdef __i386__ -int peakval_mmx(signed short *b,int cnt); -int peakval_sse(signed short *b,int cnt); -int peakval_sse2(signed short *b,int cnt); -#endif - -#ifdef __x86_64__ -int peakval_sse2(signed short *b,int cnt); -#endif - -#ifdef __VEC__ -int peakval_av(signed short *b,int cnt); -#endif - -int peakval(signed short *b,int cnt){ - find_cpu_mode(); - - switch(Cpu_mode){ - case PORT: - default: - return peakval_port(b,cnt); -#ifdef __i386__ - case MMX: - return peakval_mmx(b,cnt); - case SSE: - return peakval_sse(b,cnt); - case SSE2: - return peakval_sse2(b,cnt); -#endif - -#ifdef __x86_64__ - case SSE2: - return peakval_port(b,cnt); - //return peakval_sse2(b,cnt); -#endif - -#ifdef __VEC__ - case ALTIVEC: - return peakval_av(b,cnt); -#endif - } -} diff --git a/libfec/peakval_av.c b/libfec/peakval_av.c deleted file mode 100644 index ae54c10..0000000 --- a/libfec/peakval_av.c +++ /dev/null @@ -1,61 +0,0 @@ -/* Return the largest absolute value of a vector of signed shorts - - * This is the Altivec SIMD version. 
- - * Copyright 2004 Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ - -#include "fec.h" - -signed short peakval_av(signed short *in,int cnt){ - vector signed short x; - int pad; - union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s; - vector signed short smallest,largest; - - smallest = (vector signed short)(0); - largest = (vector signed short)(0); - if((pad = (int)in & 15)!=0){ - /* Load unaligned leading word */ - x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); - if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ - s.c[15] = (8-cnt)<<4; - x = vec_sro(x,s.cv); - } - smallest = vec_min(smallest,x); - largest = vec_max(largest,x); - in += 8-pad/2; - cnt -= 8-pad/2; - } - /* Everything is now aligned, rip through most of the block */ - while(cnt >= 8){ - x = vec_ld(0,in); - smallest = vec_min(smallest,x); - largest = vec_max(largest,x); - in += 8; - cnt -= 8; - } - /* Handle trailing fragment, if any */ - if(cnt > 0){ - x = vec_ld(0,in); - s.c[15] = (8-cnt)<<4; - x = vec_sro(x,s.cv); - smallest = vec_min(smallest,x); - largest = vec_max(largest,x); - } - /* Combine and extract result */ - largest = vec_max(largest,vec_abs(smallest)); - - s.c[15] = 64; /* Shift right four 16-bit words */ - largest = vec_max(largest,vec_sro(largest,s.cv)); - - s.c[15] = 32; /* Shift right two 16-bit words */ - largest = vec_max(largest,vec_sro(largest,s.cv)); - - s.c[15] = 16; /* Shift right one 16-bit word */ - largest = vec_max(largest,vec_sro(largest,s.cv)); - - s.hv = largest; - return s.s[7]; -} diff --git a/libfec/rs_speedtest.c b/libfec/rs_speedtest.c deleted file mode 100644 index 225f160..0000000 --- a/libfec/rs_speedtest.c +++ /dev/null @@ -1,54 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "fec.h" - -int main(){ - unsigned char block[255]; - int i; - void *rs; - struct rusage start,finish; - double extime; 
- int trials = 10000; - - for(i=0;i<223;i++) - block[i] = 0x01; - - rs = init_rs_char(8,0x187,112,11,32,0); - encode_rs_char(rs,block,&block[223]); - - getrusage(RUSAGE_SELF,&start); - for(i=0;i -#include -#include -#include -#include "fec.h" - - -struct etab { - int symsize; - int genpoly; - int fcs; - int prim; - int nroots; - int ntrials; -} Tab[] = { - {2, 0x7, 1, 1, 1, 10 }, - {3, 0xb, 1, 1, 2, 10 }, - {4, 0x13, 1, 1, 4, 10 }, - {5, 0x25, 1, 1, 6, 10 }, - {6, 0x43, 1, 1, 8, 10 }, - {7, 0x89, 1, 1, 10, 10 }, - {8, 0x11d, 1, 1, 32, 10 }, - {8, 0x187, 112,11, 32, 10 }, /* Duplicates CCSDS codec */ - {9, 0x211, 1, 1, 32, 10 }, - {10,0x409, 1, 1, 32, 10 }, - {11,0x805, 1, 1, 32, 10 }, - {12,0x1053, 1, 1, 32, 5 }, - {13,0x201b, 1, 1, 32, 2 }, - {14,0x4443, 1, 1, 32, 1 }, - {15,0x8003, 1, 1, 32, 1 }, - {16,0x1100b, 1, 1, 32, 1 }, - {0, 0, 0, 0, 0}, -}; - -int exercise_char(struct etab *e); -int exercise_int(struct etab *e); -int exercise_8(void); - -int main(){ - int i; - - srandom(time(NULL)); - - printf("Testing fixed CCSDS encoder...\n"); - exercise_8(); - for(i=0;Tab[i].symsize != 0;i++){ - int nn,kk; - - nn = (1<symsize) - 1; - unsigned char block[nn],tblock[nn]; - int errlocs[nn],derrlocs[nn]; - int i; - int errors; - int derrors,kk; - int errval,errloc; - int erasures; - int decoder_errors = 0; - void *rs; - - if(e->symsize > 8) - return -1; - - /* Compute code parameters */ - kk = nn - e->nroots; - - rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); - if(rs == NULL){ - printf("init_rs_char failed!\n"); - return -1; - } - /* Test up to the error correction capacity of the code */ - for(errors=0;errors <= e->nroots/2;errors++){ - - /* Load block with random data and encode */ - for(i=0;isymsize) - 1; - int block[nn],tblock[nn]; - int errlocs[nn],derrlocs[nn]; - int i; - int errors; - int derrors,kk; - int errval,errloc; - int erasures; - int decoder_errors = 0; - void *rs; - - /* Compute code parameters */ - kk = nn - e->nroots; - - rs = 
init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); - if(rs == NULL){ - printf("init_rs_int failed!\n"); - return -1; - } - /* Test up to the error correction capacity of the code */ - for(errors=0;errors <= e->nroots/2;errors++){ - - /* Load block with random data and encode */ - for(i=0;i -#include -#include - -/* These values should trigger leading/trailing array fragment handling */ -#define NSAMP 200002 -#define OFFSET 1 - -long long sumsq_wq(signed short *in,int cnt); -long long sumsq_wq_ref(signed short *in,int cnt); - -int main(){ - int i; - long long result,rresult; - signed short samples[NSAMP]; - - srandom(time(NULL)); - - for(i=0;i -#include "fec.h" - -unsigned long long sumsq_port(signed short *,int); - -#ifdef __i386__ -unsigned long long sumsq_mmx(signed short *,int); -unsigned long long sumsq_sse(signed short *,int); -unsigned long long sumsq_sse2(signed short *,int); -#endif - -#ifdef __x86_64__ -unsigned long long sumsq_sse2(signed short *,int); -#endif - -#ifdef __VEC__ -unsigned long long sumsq_av(signed short *,int); -#endif - -unsigned long long sumsq(signed short *in,int cnt){ - switch(Cpu_mode){ - case PORT: - default: - return sumsq_port(in,cnt); -#ifdef __i386__ - case SSE: - case MMX: - return sumsq_mmx(in,cnt); - case SSE2: - return sumsq_sse2(in,cnt); -#endif - -#ifdef __x86_64__ - case SSE2: - return sumsq_port(in,cnt); - //return sumsq_sse2(in,cnt); -#endif - -#ifdef __VEC__ - case ALTIVEC: - return sumsq_av(in,cnt); -#endif - } -} diff --git a/libfec/sumsq_av.c b/libfec/sumsq_av.c deleted file mode 100644 index 53c6acf..0000000 --- a/libfec/sumsq_av.c +++ /dev/null @@ -1,78 +0,0 @@ -/* Compute the sum of the squares of a vector of signed shorts - - * This is the Altivec SIMD version. 
It's a little hairy because Altivec - * does not do 64-bit operations directly, so we have to accumulate separate - * 32-bit sums and carries - - * Copyright 2004 Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ - -#include "fec.h" - -unsigned long long sumsq_av(signed short *in,int cnt){ - long long sum; - vector signed short x; - vector unsigned int sums,carries,s1,s2; - int pad; - union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s; - - carries = sums = (vector unsigned int)(0); - if((pad = (int)in & 15)!=0){ - /* Load unaligned leading word */ - x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); - if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ - s.c[15] = (8-cnt)<<4; - x = vec_sro(x,s.cv); - } - sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); - in += 8-pad/2; - cnt -= 8-pad/2; - } - /* Everything is now aligned, rip through most of the block */ - while(cnt >= 8){ - x = vec_ld(0,in); - /* A single vec_msum cannot overflow, but we have to sum it with - * the earlier terms separately to handle the carries - * The cast to unsigned is OK because squares are always positive - */ - s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); - carries = vec_add(carries,vec_addc(sums,s1)); - sums = vec_add(sums,s1); - in += 8; - cnt -= 8; - } - /* Handle trailing fragment, if any */ - if(cnt > 0){ - x = vec_ld(0,in); - s.c[15] = (8-cnt)<<4; - x = vec_sro(x,s.cv); - s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); - carries = vec_add(carries,vec_addc(sums,s1)); - sums = vec_add(sums,s1); - } - /* Combine 4 sub-sums and carries */ - s.c[15] = 64; /* Shift right two 32-bit words */ - s1 = vec_sro(sums,s.cv); - s2 = vec_sro(carries,s.cv); - carries = vec_add(carries,vec_addc(sums,s1)); - sums = vec_add(sums,s1); - carries = vec_add(carries,s2); - - s.c[15] = 32; /* Shift right one 32-bit word */ - 
s1 = vec_sro(sums,s.cv); - s2 = vec_sro(carries,s.cv); - carries = vec_add(carries,vec_addc(sums,s1)); - sums = vec_add(sums,s1); - carries = vec_add(carries,s2); - - /* Extract sum and carries from right-hand words and combine into result */ - s.iv = sums; - sum = s.w[3]; - - s.iv = carries; - sum += (long long)s.w[3] << 32; - - return sum; -} - diff --git a/libfec/sumsq_test.c b/libfec/sumsq_test.c deleted file mode 100644 index 4debd47..0000000 --- a/libfec/sumsq_test.c +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include -#include -#include -#include "config.h" -#ifdef HAVE_GETOPT_H -#include -#endif -#include "fec.h" - -#if HAVE_GETOPT_LONG -struct option Options[] = { - {"frame-length",1,NULL,'l'}, - {"frame-count",1,NULL,'n'}, - {"verbose",0,NULL,'v'}, - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {NULL}, -}; -#endif - -int Verbose = 0; - -int main(int argc,char *argv[]){ - signed short *buf; - int i,d,trial,trials=10000; - int bufsize = 2048; - long long port_sum,simd_sum; - time_t t; - int timetrials=0; - - find_cpu_mode(); - time(&t); - srandom(t); - -#if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){ -#else - while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){ -#endif - switch(d){ - case 'a': - Cpu_mode = ALTIVEC; - break; - case 'p': - Cpu_mode = PORT; - break; - case 'm': - Cpu_mode = MMX; - break; - case 's': - Cpu_mode = SSE; - break; - case 't': - Cpu_mode = SSE2; - break; - case 'l': - bufsize = atoi(optarg); - break; - case 'n': - trials = atoi(optarg); - break; - case 'v': - Verbose++; - break; - case 'T': - timetrials++; - break; - } - } - - buf = (signed short *)calloc(bufsize,sizeof(signed short)); - if(timetrials){ - for(trial=0;trial -#include -#include -#include "fec.h" - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi27(int len){ - find_cpu_mode(); - - switch(Cpu_mode){ - 
case PORT: - default: - return create_viterbi27_port(len); -#ifdef __VEC__ - case ALTIVEC: - return create_viterbi27_av(len); -#endif -#ifdef __i386__ - case MMX: - return create_viterbi27_mmx(len); - case SSE: - return create_viterbi27_sse(len); - case SSE2: - return create_viterbi27_sse2(len); -#endif -#ifdef __x86_64__ - case SSE2: - return create_viterbi27_port(len); -#endif - } -} - -void set_viterbi27_polynomial(int polys[2]){ - switch(Cpu_mode){ - case PORT: - default: - set_viterbi27_polynomial_port(polys); - break; -#ifdef __VEC__ - case ALTIVEC: - set_viterbi27_polynomial_av(polys); - break; -#endif -#ifdef __i386__ - case MMX: - set_viterbi27_polynomial_mmx(polys); - break; - case SSE: - set_viterbi27_polynomial_sse(polys); - break; - case SSE2: - set_viterbi27_polynomial_sse2(polys); - break; -#endif -#ifdef __x86_64__ - case SSE2: - set_viterbi27_polynomial_port(polys); - break; -#endif - } -} - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27(void *p,int starting_state){ - switch(Cpu_mode){ - case PORT: - default: - return init_viterbi27_port(p,starting_state); -#ifdef __VEC__ - case ALTIVEC: - return init_viterbi27_av(p,starting_state); -#endif -#ifdef __i386__ - case MMX: - return init_viterbi27_mmx(p,starting_state); - case SSE: - return init_viterbi27_sse(p,starting_state); - case SSE2: - return init_viterbi27_sse2(p,starting_state); -#endif -#ifdef __x86_64__ - case SSE2: - return init_viterbi27_port(p,starting_state); -#endif - } -} - -/* Viterbi chainback */ -int chainback_viterbi27( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - - switch(Cpu_mode){ - case PORT: - default: - return chainback_viterbi27_port(p,data,nbits,endstate); -#ifdef __VEC__ - case ALTIVEC: - return chainback_viterbi27_av(p,data,nbits,endstate); -#endif -#ifdef __i386__ - case MMX: - return 
chainback_viterbi27_mmx(p,data,nbits,endstate); - case SSE: - return chainback_viterbi27_sse(p,data,nbits,endstate); - case SSE2: - return chainback_viterbi27_sse2(p,data,nbits,endstate); -#endif -#ifdef __x86_64__ - case SSE2: - return chainback_viterbi27_port(p,data,nbits,endstate); -#endif - } -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi27(void *p){ - switch(Cpu_mode){ - case PORT: - default: - delete_viterbi27_port(p); - break; -#ifdef __VEC__ - case ALTIVEC: - delete_viterbi27_av(p); - break; -#endif -#ifdef __i386__ - case MMX: - delete_viterbi27_mmx(p); - break; - case SSE: - delete_viterbi27_sse(p); - break; - case SSE2: - delete_viterbi27_sse2(p); - break; -#endif -#ifdef __x86_64__ - case SSE2: - delete_viterbi27_port(p); - break; -#endif - } -} - -/* Update decoder with a block of demodulated symbols - * Note that nbits is the number of decoded data bits, not the number - * of symbols! - */ -int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){ - if(p == NULL) - return -1; - - switch(Cpu_mode){ - case PORT: - default: - update_viterbi27_blk_port(p,syms,nbits); - break; -#ifdef __VEC__ - case ALTIVEC: - update_viterbi27_blk_av(p,syms,nbits); - break; -#endif -#ifdef __i386__ - case MMX: - update_viterbi27_blk_mmx(p,syms,nbits); - break; - case SSE: - update_viterbi27_blk_sse(p,syms,nbits); - break; - case SSE2: - update_viterbi27_blk_sse2(p,syms,nbits); - break; -#endif -#ifdef __x86_64__ - case SSE2: - update_viterbi27_blk_port(p,syms,nbits); - break; -#endif - } - return 0; -} diff --git a/libfec/viterbi27_av.c b/libfec/viterbi27_av.c deleted file mode 100644 index 98d7344..0000000 --- a/libfec/viterbi27_av.c +++ /dev/null @@ -1,210 +0,0 @@ -/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions - * Feb 2004, Phil Karn, KA9Q - */ -#include -#include -#include -#include "fec.h" - -typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t; -typedef union { long long p; unsigned 
char c[64]; vector unsigned char v[4]; } metric_t; - -static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2]; -static int Init = 0; - -/* State info for instance of Viterbi decoder - * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! - */ -struct v27 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - decision_t *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27_av(void *p,int starting_state){ - struct v27 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<4;i++) - vp->metrics1.v[i] = (vector unsigned char)(63); - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ - return 0; -} - -void set_viterbi27_polynomial_av(int polys[2]){ - int state; - - for(state=0;state < 32;state++){ - Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; - } - Init++; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi27_av(int len){ - struct v27 *vp; - - if(!Init){ - int polys[2] = { V27POLYA,V27POLYB }; - set_viterbi27_polynomial_av(polys); - } - if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) - return NULL; - if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi27_av(vp,0); - return vp; -} - -/* Viterbi chainback */ -int chainback_viterbi27_av( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v27 *vp = p; - decision_t *d = (decision_t *)vp->decisions; - - if(p == NULL) - return -1; - - /* Make room beyond the end of the encoder register so we can - * accumulate a full byte of decoded data - */ - endstate %= 64; - endstate <<= 2; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 6; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = d[nbits].c[endstate>>2] & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); - } - return 0; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi27_av(void *p){ - struct v27 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - -/* Process received symbols */ -int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){ - struct v27 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - d = (decision_t *)vp->dp; - while(nbits--){ - vector unsigned char survivor0,survivor1,sym0v,sym1v; - vector bool char decision0,decision1; - vector unsigned char metric,m_metric,m0,m1,m2,m3; - void *tmp; - - /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */ - sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); - - sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */ 
- sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */ - syms += 2; - - /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */ - - /* Form first set of 16 branch metrics */ - metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v)); - metric = vec_sr(metric,(vector unsigned char)(3)); - m_metric = vec_sub((vector unsigned char)(31),metric); - - /* Form first set of path metrics */ - m0 = vec_adds(vp->old_metrics->v[0],metric); - m3 = vec_adds(vp->old_metrics->v[2],metric); - m1 = vec_adds(vp->old_metrics->v[2],m_metric); - m2 = vec_adds(vp->old_metrics->v[0],m_metric); - - /* Form second set of 16 branch metrics */ - metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v)); - metric = vec_sr(metric,(vector unsigned char)(3)); - m_metric = vec_sub((vector unsigned char)(31),metric); - - /* Compare and select first set */ - decision0 = vec_cmpgt(m0,m1); - decision1 = vec_cmpgt(m2,m3); - survivor0 = vec_min(m0,m1); - survivor1 = vec_min(m2,m3); - - /* Compute second set of path metrics */ - m0 = vec_adds(vp->old_metrics->v[1],metric); - m3 = vec_adds(vp->old_metrics->v[3],metric); - m1 = vec_adds(vp->old_metrics->v[3],m_metric); - m2 = vec_adds(vp->old_metrics->v[1],m_metric); - - /* Interleave and store first decisions and survivors */ - d->v[0] = vec_mergeh(decision0,decision1); - d->v[1] = vec_mergel(decision0,decision1); - vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1); - vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1); - - /* Compare and select second set */ - decision0 = vec_cmpgt(m0,m1); - decision1 = vec_cmpgt(m2,m3); - survivor0 = vec_min(m0,m1); - survivor1 = vec_min(m2,m3); - - /* Interleave and store second set of decisions and survivors */ - d->v[2] = vec_mergeh(decision0,decision1); - d->v[3] = vec_mergel(decision0,decision1); - vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1); - vp->new_metrics->v[3] = 
vec_mergel(survivor0,survivor1); - - /* renormalize if necessary */ - if(vp->new_metrics->c[0] >= 105){ - vector unsigned char scale0,scale1; - - /* Find smallest metric and splat */ - scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]); - scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]); - scale0 = vec_min(scale0,scale1); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); - - /* Now subtract from all metrics */ - vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0); - vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0); - vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0); - vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - - return 0; -} - diff --git a/libfec/viterbi27_sse2.c b/libfec/viterbi27_sse2.c deleted file mode 100644 index bc01710..0000000 --- a/libfec/viterbi27_sse2.c +++ /dev/null @@ -1,180 +0,0 @@ -/* K=7 r=1/2 Viterbi decoder for SSE2 - * Feb 2004, Phil Karn, KA9Q - */ -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; -typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; -union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; -static int Init = 0; - -/* State info for instance of Viterbi decoder - * Don't change this without also changing references in sse2bfly27.s! 
- */ -struct v27 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - decision_t *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi27_sse2(void *p,int starting_state){ - struct v27 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<64;i++) - vp->metrics1.c[i] = 63; - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ - return 0; -} - -void set_viterbi27_polynomial_sse2(int polys[2]){ - int state; - - for(state=0;state < 32;state++){ - Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; - } - Init++; -} - - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi27_sse2(int len){ - void *p; - struct v27 *vp; - - if(!Init){ - int polys[2] = { V27POLYA, V27POLYB }; - set_viterbi27_polynomial_sse2(polys); - } - /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ - if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v27))) - return NULL; - vp = (struct v27 *)p; - - if((p = malloc((len+6)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - vp->decisions = (decision_t *)p; - init_viterbi27_sse2(vp,0); - - return vp; -} - -/* Viterbi chainback */ -int chainback_viterbi27_sse2( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v27 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - d = vp->decisions; - /* Make room beyond the end of the encoder register so we can - * accumulate a full byte of decoded data - */ - endstate %= 64; - endstate <<= 2; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 6; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); - } - return 0; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi27_sse2(void *p){ - struct v27 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -#if 0 -/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. 
*/ -void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){ - struct v27 *vp = p; - decision_t *d; - - if(p == NULL) - return; - d = (decision_t *)vp->dp; - while(nbits--){ - __m128i sym0v,sym1v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_epi8(syms[0]); - sym1v = _mm_set1_epi8(syms[1]); - syms += 2; - - for(i=0;i<2;i++){ - __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics */ - metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v)); - /* There's no packed bytes right shift in SSE2, so we use the word version and mask - * (I'm *really* starting to like Altivec...) - */ - metric = _mm_srli_epi16(metric,3); - metric = _mm_and_si128(metric,_mm_set1_epi8(31)); - m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_add_epi8(vp->old_metrics->v[i],metric); - m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric); - m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric); - m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric); - - /* Compare and select, using modulo arithmetic */ - decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128()); - decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128()); - survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0)); - survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2)); - - /* Pack each set of decisions into 16 bits */ - d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1)); - d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1)); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = 
vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; -} -#endif diff --git a/libfec/viterbi29.c b/libfec/viterbi29.c deleted file mode 100644 index f51e356..0000000 --- a/libfec/viterbi29.c +++ /dev/null @@ -1,178 +0,0 @@ -/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD - * Copyright Feb 2004, Phil Karn, KA9Q - */ -#include -#include -#include -#include "fec.h" - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi29(int len){ - find_cpu_mode(); - - switch(Cpu_mode){ - case PORT: - default: - return create_viterbi29_port(len); -#ifdef __VEC__ - case ALTIVEC: - return create_viterbi29_av(len); -#endif -#ifdef __i386__ - case MMX: - return create_viterbi29_mmx(len); - case SSE: - return create_viterbi29_sse(len); - case SSE2: - return create_viterbi29_sse2(len); -#endif -#ifdef __x86_64__ - case SSE2: - return create_viterbi29_port(len); -#endif - } -} - -void set_viterbi29_polynomial(int polys[2]){ - switch(Cpu_mode){ - case PORT: - default: - set_viterbi29_polynomial_port(polys); - break; -#ifdef __VEC__ - case ALTIVEC: - set_viterbi29_polynomial_av(polys); - break; -#endif -#ifdef __i386__ - case MMX: - set_viterbi29_polynomial_mmx(polys); - break; - case SSE: - set_viterbi29_polynomial_sse(polys); - break; - case SSE2: - set_viterbi29_polynomial_sse2(polys); - break; -#endif -#ifdef __x86_64__ - case SSE2: - set_viterbi29_polynomial_port(polys); - break; -#endif - } -} - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29(void *p,int starting_state){ - switch(Cpu_mode){ - case PORT: - default: - return init_viterbi29_port(p,starting_state); -#ifdef __VEC__ - case ALTIVEC: - return init_viterbi29_av(p,starting_state); -#endif -#ifdef __i386__ - case MMX: - return init_viterbi29_mmx(p,starting_state); - case SSE: - return init_viterbi29_sse(p,starting_state); - case SSE2: - return init_viterbi29_sse2(p,starting_state); -#endif -#ifdef __x86_64__ - case 
SSE2: - return init_viterbi29_port(p,starting_state); -#endif - } -} - -/* Viterbi chainback */ -int chainback_viterbi29( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - - switch(Cpu_mode){ - case PORT: - default: - return chainback_viterbi29_port(p,data,nbits,endstate); -#ifdef __VEC__ - case ALTIVEC: - return chainback_viterbi29_av(p,data,nbits,endstate); -#endif -#ifdef __i386__ - case MMX: - return chainback_viterbi29_mmx(p,data,nbits,endstate); - case SSE: - return chainback_viterbi29_sse(p,data,nbits,endstate); - case SSE2: - return chainback_viterbi29_sse2(p,data,nbits,endstate); -#endif -#ifdef __x86_64__ - case SSE2: - return chainback_viterbi29_port(p,data,nbits,endstate); -#endif - } -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi29(void *p){ - switch(Cpu_mode){ - case PORT: - default: - delete_viterbi29_port(p); - break; -#ifdef __VEC__ - case ALTIVEC: - delete_viterbi29_av(p); - break; -#endif -#ifdef __i386__ - case MMX: - delete_viterbi29_mmx(p); - break; - case SSE: - delete_viterbi29_sse(p); - break; - case SSE2: - delete_viterbi29_sse2(p); - break; -#endif -#ifdef __x86_64__ - case SSE2: - delete_viterbi29_port(p); - break; -#endif - } -} - -/* Update decoder with a block of demodulated symbols - * Note that nbits is the number of decoded data bits, not the number - * of symbols! 
- */ -int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){ - switch(Cpu_mode){ - case PORT: - default: - return update_viterbi29_blk_port(p,syms,nbits); -#ifdef __VEC__ - case ALTIVEC: - return update_viterbi29_blk_av(p,syms,nbits); -#endif -#ifdef __i386__ - case MMX: - return update_viterbi29_blk_mmx(p,syms,nbits); - case SSE: - return update_viterbi29_blk_sse(p,syms,nbits); - case SSE2: - return update_viterbi29_blk_sse2(p,syms,nbits); -#endif -#ifdef __x86_64__ - case SSE2: - return update_viterbi29_blk_port(p,syms,nbits); -#endif - } -} diff --git a/libfec/viterbi29_av.c b/libfec/viterbi29_av.c deleted file mode 100644 index 31c8d27..0000000 --- a/libfec/viterbi29_av.c +++ /dev/null @@ -1,190 +0,0 @@ -/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec - * Copyright Feb 2004, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t; -typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t; - -static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v29 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - decision_t *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - decision_t *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi29_av(void *p,int starting_state){ - struct v29 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<16;i++) - vp->metrics1.v[i] = (vector unsigned char)(63); - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->c[starting_state & 
255] = 0; /* Bias known start state */ - return 0; -} - -void set_viterbi29_polynomial_av(int polys[2]){ - int state; - - for(state=0;state < 128;state++){ - Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; - Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; - } - Init++; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi29_av(int len){ - struct v29 *vp; - - if(!Init){ - int polys[2] = { V29POLYA,V29POLYB }; - set_viterbi29_polynomial_av(polys); - } - if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) - return NULL; - if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi29_av(vp,0); - return vp; -} - -/* Viterbi chainback */ -int chainback_viterbi29_av( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v29 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - d = (decision_t *)vp->decisions; - /* Make room beyond the end of the encoder register so we can - * accumulate a full byte of decoded data - */ - endstate %= 256; - - /* The store into data[] only needs to be done every 8 bits. 
- * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 8; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = d[nbits].c[endstate] & 1; - data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); - } - return 0; -} - - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi29_av(void *p){ - struct v29 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){ - struct v29 *vp = p; - decision_t *d; - int i; - - if(p == NULL) - return -1; - d = (decision_t *)vp->dp; - - while(nbits--){ - vector unsigned char sym1v,sym2v; - void *tmp; - - /* All this seems necessary just to load a byte into all elements of a vector! */ - sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */ - sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */ - sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */ - syms += 2; - - for(i=0;i<8;i++){ - vector bool char decision0,decision1; - vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics */ - metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v)); - metric = vec_sr(metric,(vector unsigned char)(3)); - m_metric = (vector unsigned char)(31) - metric; - - /* Add branch metrics to path metrics */ - m0 = vec_adds(vp->old_metrics->v[i],metric); - m3 = vec_adds(vp->old_metrics->v[8+i],metric); - m1 = vec_adds(vp->old_metrics->v[8+i],m_metric); - m2 = vec_adds(vp->old_metrics->v[i],m_metric); - - /* Compare and select first set */ - decision0 = vec_cmpgt(m0,m1); - decision1 = vec_cmpgt(m2,m3); - survivor0 = vec_min(m0,m1); - survivor1 = vec_min(m2,m3); - - /* Interleave and store decisions and survivors */ - d->v[2*i] = vec_mergeh(decision0,decision1); - d->v[2*i+1] = vec_mergel(decision0,decision1); - vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); 
- vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); - } - d++; - /* renormalize if necessary */ - if(vp->new_metrics->c[0] >= 50){ - int i; - vector unsigned char scale0,scale1; - - /* Find smallest metric and splat */ - scale0 = vp->new_metrics->v[0]; - scale1 = vp->new_metrics->v[1]; - for(i=2;i<16;i+=2){ - scale0 = vec_min(scale0,vp->new_metrics->v[i]); - scale1 = vec_min(scale1,vp->new_metrics->v[i+1]); - } - scale0 = vec_min(scale0,scale1); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); - scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); - - /* Now subtract from all metrics */ - for(i=0;i<16;i++) - vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0); - } - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - return 0; -} diff --git a/libfec/viterbi39.c b/libfec/viterbi39.c deleted file mode 100644 index d2e65f4..0000000 --- a/libfec/viterbi39.c +++ /dev/null @@ -1,179 +0,0 @@ -/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD - * Copyright Aug 2006, Phil Karn, KA9Q - */ -#include -#include -#include -#include "fec.h" - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi39(int len){ - find_cpu_mode(); - - switch(Cpu_mode){ - case PORT: - default: - return create_viterbi39_port(len); -#ifdef __VEC__ - case ALTIVEC: - return create_viterbi39_av(len); -#endif -#ifdef __i386__ - case MMX: - return create_viterbi39_mmx(len); - case SSE: - return create_viterbi39_sse(len); - case SSE2: - return create_viterbi39_sse2(len); -#endif -#ifdef __x86_64__ - case SSE2: - return create_viterbi39_port(len); -#endif - } -} - -void set_viterbi39_polynomial(int polys[3]){ - switch(Cpu_mode){ - case PORT: - default: - set_viterbi39_polynomial_port(polys); - break; -#ifdef __VEC__ - case ALTIVEC: - 
set_viterbi39_polynomial_av(polys); - break; -#endif -#ifdef __i386__ - case MMX: - set_viterbi39_polynomial_mmx(polys); - break; - case SSE: - set_viterbi39_polynomial_sse(polys); - break; - case SSE2: - set_viterbi39_polynomial_sse2(polys); - break; -#endif -#ifdef __x86_64__ - case SSE2: - set_viterbi39_polynomial_port(polys); - break; -#endif - } -} - - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39(void *p,int starting_state){ - switch(Cpu_mode){ - case PORT: - default: - return init_viterbi39_port(p,starting_state); -#ifdef __VEC__ - case ALTIVEC: - return init_viterbi39_av(p,starting_state); -#endif -#ifdef __i386__ - case MMX: - return init_viterbi39_mmx(p,starting_state); - case SSE: - return init_viterbi39_sse(p,starting_state); - case SSE2: - return init_viterbi39_sse2(p,starting_state); -#endif -#ifdef __x86_64__ - case SSE2: - return init_viterbi39_port(p,starting_state); -#endif - } -} - -/* Viterbi chainback */ -int chainback_viterbi39( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - - switch(Cpu_mode){ - case PORT: - default: - return chainback_viterbi39_port(p,data,nbits,endstate); -#ifdef __VEC__ - case ALTIVEC: - return chainback_viterbi39_av(p,data,nbits,endstate); -#endif -#ifdef __i386__ - case MMX: - return chainback_viterbi39_mmx(p,data,nbits,endstate); - case SSE: - return chainback_viterbi39_sse(p,data,nbits,endstate); - case SSE2: - return chainback_viterbi39_sse2(p,data,nbits,endstate); -#endif -#ifdef __x86_64__ - case SSE2: - return chainback_viterbi39_port(p,data,nbits,endstate); -#endif - } -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi39(void *p){ - switch(Cpu_mode){ - case PORT: - default: - delete_viterbi39_port(p); - break; -#ifdef __VEC__ - case ALTIVEC: - delete_viterbi39_av(p); - break; -#endif -#ifdef __i386__ - case MMX: - delete_viterbi39_mmx(p); - break; - 
case SSE: - delete_viterbi39_sse(p); - break; - case SSE2: - delete_viterbi39_sse2(p); - break; -#endif -#ifdef __x86_64__ - case SSE2: - delete_viterbi39_port(p); - break; -#endif - } -} - -/* Update decoder with a block of demodulated symbols - * Note that nbits is the number of decoded data bits, not the number - * of symbols! - */ -int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){ - switch(Cpu_mode){ - case PORT: - default: - return update_viterbi39_blk_port(p,syms,nbits); -#ifdef __VEC__ - case ALTIVEC: - return update_viterbi39_blk_av(p,syms,nbits); -#endif -#ifdef __i386__ - case MMX: - return update_viterbi39_blk_mmx(p,syms,nbits); - case SSE: - return update_viterbi39_blk_sse(p,syms,nbits); - case SSE2: - return update_viterbi39_blk_sse2(p,syms,nbits); -#endif -#ifdef __x86_64__ - case SSE2: - return update_viterbi39_blk_port(p,syms,nbits); -#endif - } -} diff --git a/libfec/viterbi39_mmx.c b/libfec/viterbi39_mmx.c deleted file mode 100644 index 875391a..0000000 --- a/libfec/viterbi39_mmx.c +++ /dev/null @@ -1,185 +0,0 @@ -/* K=9 r=1/3 Viterbi decoder for x86 MMX - * Aug 2006, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned char c[256]; __m64 v[32];} decision_t; -typedef union { unsigned short s[256]; __m64 v[64];} metric_t; - -static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v39 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39_mmx(void *p,int starting_state){ - struct v39 
*vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<256;i++) - vp->metrics1.s[i] = 1000; - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ - return 0; -} - -void set_viterbi39_polynomial_mmx(int polys[3]){ - int state; - - for(state=0;state < 128;state++){ - Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0; - Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0; - Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 255:0; - } - Init++; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi39_mmx(int len){ - struct v39 *vp; - - if(!Init){ - int polys[3] = { V39POLYA,V39POLYB,V39POLYC }; - set_viterbi39_polynomial_mmx(polys); - } - if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) - return NULL; - if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi39_mmx(vp,0); - return vp; -} - - - -/* Viterbi chainback */ -int chainback_viterbi39_mmx( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v39 *vp = p; - decision_t *d; - int path_metric; - - if(p == NULL) - return -1; - - d = (decision_t *)vp->decisions; - - endstate %= 256; - - path_metric = vp->old_metrics->s[endstate]; - - /* The store into data[] only needs to be done every 8 bits. 
- * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 8; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = d[nbits].c[endstate] & 1; - endstate = (k << 7) | (endstate >> 1); - data[nbits>>3] = endstate; - } - return path_metric; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi39_mmx(void *p){ - struct v39 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){ - struct v39 *vp = p; - decision_t *d; - int path_metric = 0; - - if(p == NULL) - return -1; - - d = (decision_t *)vp->dp; - - while(nbits--){ - __m64 sym0v,sym1v,sym2v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_pi16(syms[0]); - sym1v = _mm_set1_pi16(syms[1]); - sym2v = _mm_set1_pi16(syms[2]); - syms += 3; - - for(i=0;i<32;i++){ - __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. 
- * metric and m_metric (-metric) are in the range 0-1530 - */ - m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); - metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); - m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); - m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric); - m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric); - m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); - - /* Compare and select - * There's no packed min instruction in MMX, so we use modulo arithmetic - * to form the decisions and then do the select the hard way - */ - decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); - decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); - survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); - survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); - - /* Merge decisions and store as bytes */ - d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); - } - if(vp->new_metrics->s[0] < vp->old_metrics->s[0]) - path_metric += 65536; /* Hack: wraparound probably occured */ - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - _mm_empty(); - return path_metric; -} diff --git a/libfec/viterbi39_sse.c b/libfec/viterbi39_sse.c deleted file mode 100644 index c2f2865..0000000 --- a/libfec/viterbi39_sse.c +++ /dev/null @@ -1,201 +0,0 @@ -/* K=9 r=1/3 Viterbi decoder for x86 SSE - * Copyright Aug 2006, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public 
License (LGPL) - */ -#include -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; -typedef union { signed short s[256]; __m64 v[64];} metric_t; - -static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v39 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi39_sse(void *p,int starting_state){ - struct v39 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<256;i++) - vp->metrics1.s[i] = (SHRT_MIN+1000); - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ - return 0; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi39_sse(int len){ - struct v39 *vp; - - if(!Init){ - int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; - - set_viterbi39_polynomial_sse(polys); - } - if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL){ - return NULL; - } - if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi39_sse(vp,0); - return vp; -} - -void set_viterbi39_polynomial_sse(int polys[3]){ - int state; - - for(state=0;state < 128;state++){ - Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & polys[0]) ? 255:0; - Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & polys[1]) ? 255:0; - Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & polys[2]) ? 
255:0; - } - Init++; -} - -/* Viterbi chainback */ -int chainback_viterbi39_sse( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v39 *vp = p; - decision_t *d; - int path_metric; - - if(p == NULL) - return -1; - d = (decision_t *)vp->decisions; - endstate %= 256; - - path_metric = vp->old_metrics->s[endstate]; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 8; /* Look past tail */ - while(nbits-- != 0){ - int k; - - /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ - k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; - endstate = (k << 7) | (endstate >> 1); - data[nbits>>3] = endstate; - } - return path_metric - SHRT_MIN; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi39_sse(void *p){ - struct v39 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){ - struct v39 *vp = p; - decision_t *d; - int path_metric = 0; - - if(p == NULL) - return -1; - d = (decision_t *)vp->dp; - while(nbits--){ - __m64 sym0v,sym1v,sym2v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_pi16(syms[0]); - sym1v = _mm_set1_pi16(syms[1]); - sym2v = _mm_set1_pi16(syms[2]); - syms += 3; - - for(i=0;i<32;i++){ - __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. 
- * metric and m_metric (-metric) are in the range 0-765 - */ - m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); - metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); - m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); - m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric); - m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric); - m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); - - /* Compare and select */ - survivor0 = _mm_min_pi16(m0,m1); - survivor1 = _mm_min_pi16(m2,m3); - decision0 = _mm_cmpeq_pi16(survivor0,m1); - decision1 = _mm_cmpeq_pi16(survivor1,m3); - - /* Pack decisions into 8 bits and store */ - d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); - } - /* See if we need to renormalize - * Max metric spread for this code with 0-255 branch metrics is 12750 - */ - if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ - int i,adjust; - __m64 adjustv; - union { __m64 v; signed short w[4]; } t; - - /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ - adjustv = vp->new_metrics->v[0]; - for(i=1;i<64;i++) - adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); - - adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); - adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); - t.v = adjustv; - adjust = t.w[0] - SHRT_MIN; - path_metric += adjust; - adjustv = _mm_set1_pi16(adjust); - - for(i=0;i<64;i++) - vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp 
= d; - _mm_empty(); - return path_metric; -} diff --git a/libfec/viterbi615.c b/libfec/viterbi615.c deleted file mode 100644 index ec2fb3c..0000000 --- a/libfec/viterbi615.c +++ /dev/null @@ -1,181 +0,0 @@ -/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD - * Copyright Feb 2004, Phil Karn, KA9Q - */ -#include -#include -#include -#include "fec.h" - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi615(int len){ - - find_cpu_mode(); - - switch(Cpu_mode){ - case PORT: - default: - return create_viterbi615_port(len); -#ifdef __VEC__ - case ALTIVEC: - return create_viterbi615_av(len); -#endif -#ifdef __i386__ - case MMX: - return create_viterbi615_mmx(len); - case SSE: - return create_viterbi615_sse(len); - case SSE2: - return create_viterbi615_sse2(len); -#endif -#ifdef __x86_64__ - case SSE2: - return create_viterbi615_port(len); -#endif - } -} - -void set_viterbi615_polynomial(int polys[6]){ - - switch(Cpu_mode){ - case PORT: - default: - set_viterbi615_polynomial_port(polys); - break; -#ifdef __VEC__ - case ALTIVEC: - set_viterbi615_polynomial_av(polys); - break; -#endif -#ifdef __i386__ - case MMX: - set_viterbi615_polynomial_mmx(polys); - break; - case SSE: - set_viterbi615_polynomial_sse(polys); - break; - case SSE2: - set_viterbi615_polynomial_sse2(polys); - break; -#endif -#ifdef __x86_64__ - case SSE2: - set_viterbi615_polynomial_port(polys); - break; -#endif - } -} - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615(void *p,int starting_state){ - switch(Cpu_mode){ - case PORT: - default: - return init_viterbi615_port(p,starting_state); -#ifdef __VEC__ - case ALTIVEC: - return init_viterbi615_av(p,starting_state); -#endif -#ifdef __i386__ - case MMX: - return init_viterbi615_mmx(p,starting_state); - case SSE: - return init_viterbi615_sse(p,starting_state); - case SSE2: - return init_viterbi615_sse2(p,starting_state); -#endif -#ifdef __x86_64__ - case SSE2: - return 
init_viterbi615_port(p,starting_state); -#endif - } -} - -/* Viterbi chainback */ -int chainback_viterbi615( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - - switch(Cpu_mode){ - case PORT: - default: - return chainback_viterbi615_port(p,data,nbits,endstate); -#ifdef __VEC__ - case ALTIVEC: - return chainback_viterbi615_av(p,data,nbits,endstate); -#endif -#ifdef __i386__ - case MMX: - return chainback_viterbi615_mmx(p,data,nbits,endstate); - case SSE: - return chainback_viterbi615_sse(p,data,nbits,endstate); - case SSE2: - return chainback_viterbi615_sse2(p,data,nbits,endstate); -#endif -#ifdef __x86_64__ - case SSE2: - return chainback_viterbi615_port(p,data,nbits,endstate); -#endif - } -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi615(void *p){ - switch(Cpu_mode){ - case PORT: - default: - delete_viterbi615_port(p); - break; -#ifdef __VEC__ - case ALTIVEC: - delete_viterbi615_av(p); - break; -#endif -#ifdef __i386__ - case MMX: - delete_viterbi615_mmx(p); - break; - case SSE: - delete_viterbi615_sse(p); - break; - case SSE2: - delete_viterbi615_sse2(p); - break; -#endif -#ifdef __x86_64__ - case SSE2: - delete_viterbi615_port(p); - break; -#endif - } -} - -/* Update decoder with a block of demodulated symbols - * Note that nbits is the number of decoded data bits, not the number - * of symbols! 
- */ -int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){ - switch(Cpu_mode){ - case PORT: - default: - return update_viterbi615_blk_port(p,syms,nbits); -#ifdef __VEC__ - case ALTIVEC: - return update_viterbi615_blk_av(p,syms,nbits); -#endif -#ifdef __i386__ - case MMX: - return update_viterbi615_blk_mmx(p,syms,nbits); - case SSE: - return update_viterbi615_blk_sse(p,syms,nbits); - case SSE2: - return update_viterbi615_blk_sse2(p,syms,nbits); -#endif -#ifdef __x86_64__ - case SSE2: - return update_viterbi615_blk_port(p,syms,nbits); -#endif - } -} - diff --git a/libfec/viterbi615_av.c b/libfec/viterbi615_av.c deleted file mode 100644 index 4a6ce9c..0000000 --- a/libfec/viterbi615_av.c +++ /dev/null @@ -1,257 +0,0 @@ -/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions - * 8-bit offset-binary soft decision samples - * Copyright Mar 2004, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t; -typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t; - -static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v615 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615_av(void *p,int starting_state){ - struct v615 *vp = p; - int i; - - if(p == NULL) - return -1; - - for(i=0;i<2048;i++) - vp->metrics1.v[i] = (vector unsigned short)(5000); - - vp->old_metrics = &vp->metrics1; - 
vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ - return 0; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi615_av(int len){ - struct v615 *vp; - - if(!Init){ - int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; - set_viterbi615_polynomial_av(polys); - } - vp = (struct v615 *)malloc(sizeof(struct v615)); - vp->decisions = malloc(sizeof(decision_t)*(len+14)); - init_viterbi615_av(vp,0); - return vp; -} - -void set_viterbi615_polynomial_av(int polys[6]){ - int state; - int i; - - for(state=0;state < 8192;state++){ - for(i=0;i<6;i++) - Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; - } - Init++; -} - - -/* Viterbi chainback */ -int chainback_viterbi615_av( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v615 *vp = p; - decision_t *d = (decision_t *)vp->decisions; - int path_metric; - - endstate %= 16384; - - path_metric = vp->old_metrics->s[endstate]; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 14; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; - endstate = (k << 13) | (endstate >> 1); - data[nbits>>3] = endstate >> 6; - } - return path_metric; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi615_av(void *p){ - struct v615 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - -int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){ - struct v615 *vp = p; - decision_t *d = (decision_t *)vp->dp; - int path_metric = 0; - vector unsigned char decisions = (vector unsigned char)(0); - - while(nbits--){ - vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; - vector unsigned char s; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); - - symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ - sym0v = vec_splat(symv,0); - sym1v = vec_splat(symv,1); - sym2v = vec_splat(symv,2); - sym3v = vec_splat(symv,3); - sym4v = vec_splat(symv,4); - sym5v = vec_splat(symv,5); - syms += 6; - - for(i=0;i<1024;i++){ - vector bool short decision0,decision1; - vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. 
- * metric and m_metric (-metric) are in the range 0-1530 - */ - m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v)); - m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v)); - m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v)); - metric = vec_add(m0,m1); - metric = vec_add(metric,m2); - m_metric = vec_sub((vector unsigned short)(1530),metric); - - /* Add branch metrics to path metrics */ - m0 = vec_adds(vp->old_metrics->v[i],metric); - m3 = vec_adds(vp->old_metrics->v[1024+i],metric); - m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric); - m2 = vec_adds(vp->old_metrics->v[i],m_metric); - - /* Compare and select */ - decision0 = vec_cmpgt(m0,m1); - decision1 = vec_cmpgt(m2,m3); - survivor0 = vec_min(m0,m1); - survivor1 = vec_min(m2,m3); - - /* Store decisions and survivors. - * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in - * a funny interleaved fashion that we undo in the chainback function. - */ - decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ - - /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting - * 0xff is equivalent to adding 1, which sets the lsb. 
- */ - decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); - - vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); - - if((i % 8) == 7){ - /* We've accumulated a total of 128 decisions, stash and start again */ - d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ - } - } -#if 0 - /* Experimentally determine metric spread - * The results are fixed for a given code and input symbol size - */ - { - int i; - vector unsigned short min_metric; - vector unsigned short max_metric; - union { vector unsigned short v; unsigned short s[8];} t; - int minimum,maximum; - static int max_spread = 0; - - min_metric = max_metric = vp->new_metrics->v[0]; - for(i=1;i<2048;i++){ - min_metric = vec_min(min_metric,vp->new_metrics->v[i]); - max_metric = vec_max(max_metric,vp->new_metrics->v[i]); - } - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); - max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); - max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); - min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); - max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); - - t.v = min_metric; - minimum = t.s[0]; - t.v = max_metric; - maximum = t.s[0]; - if(maximum-minimum > max_spread){ - max_spread = maximum-minimum; - printf("metric spread = %d\n",max_spread); - } - } -#endif - - /* Renormalize if necessary. This deserves some explanation. - - * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750. - * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. - * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where - * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. - - * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric - * by not not normalizing when we should are extremely low. So either way, the risk to performance is small. - - * All this is borne out by experiment. - */ - if(vp->new_metrics->s[0] >= USHRT_MAX-12750){ - vector unsigned short scale; - union { vector unsigned short v; unsigned short s[8];} t; - - /* Find smallest metric and splat */ - scale = vp->new_metrics->v[0]; - for(i=1;i<2048;i++) - scale = vec_min(scale,vp->new_metrics->v[i]); - - scale = vec_min(scale,vec_sld(scale,scale,8)); - scale = vec_min(scale,vec_sld(scale,scale,4)); - scale = vec_min(scale,vec_sld(scale,scale,2)); - - /* Subtract it from all metrics - * Work backwards to try to improve the cache hit ratio, assuming LRU - */ - for(i=2047;i>=0;i--) - vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); - t.v = scale; - path_metric += t.s[0]; - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - return path_metric; -} diff --git a/libfec/viterbi615_mmx.c b/libfec/viterbi615_mmx.c deleted file mode 100644 index 89a56f7..0000000 --- a/libfec/viterbi615_mmx.c +++ /dev/null @@ -1,183 +0,0 @@ -/* K=15 r=1/6 Viterbi decoder for x86 MMX - * Mar 2004, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t; -typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t; - -static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v615 
{ - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615_mmx(void *p,int starting_state){ - struct v615 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<16384;i++) - vp->metrics1.s[i] = 5000; - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ - return 0; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi615_mmx(int len){ - struct v615 *vp; - - if(!Init){ - int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; - set_viterbi615_polynomial_mmx(polys); - } - - if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) - return NULL; - if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi615_mmx(vp,0); - return vp; -} - -void set_viterbi615_polynomial_mmx(int polys[6]){ - int state; - int i; - - for(state=0;state < 8192;state++){ - for(i=0;i<6;i++) - Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; - } - Init++; -} - -/* Viterbi chainback */ -int chainback_viterbi615_mmx( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v615 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - - d = (decision_t *)vp->decisions; - - endstate %= 16384; - - /* The store into data[] only needs to be done every 8 bits. 
- * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 14; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = d[nbits].c[endstate] & 1; - endstate = (k << 13) | (endstate >> 1); - data[nbits>>3] = endstate >> 6; - } - return 0; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi615_mmx(void *p){ - struct v615 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){ - struct v615 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - - d = (decision_t *)vp->dp; - - while(nbits--){ - __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_pi16(syms[0]); - sym1v = _mm_set1_pi16(syms[1]); - sym2v = _mm_set1_pi16(syms[2]); - sym3v = _mm_set1_pi16(syms[3]); - sym4v = _mm_set1_pi16(syms[4]); - sym5v = _mm_set1_pi16(syms[5]); - syms += 6; - - for(i=0;i<2048;i++){ - __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. 
- * metric and m_metric (-metric) are in the range 0-1530 - */ - m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); - m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); - m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); - metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); - m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); - m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric); - m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric); - m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); - - /* Compare and select - * There's no packed min instruction in MMX, so we use modulo arithmetic - * to form the decisions and then do the select the hard way - */ - decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); - decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); - survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); - survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); - - /* Merge decisions and store as bytes */ - d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - _mm_empty(); - return 0; -} diff --git a/libfec/viterbi615_sse.c b/libfec/viterbi615_sse.c deleted file mode 100644 index de0f8af..0000000 --- a/libfec/viterbi615_sse.c +++ /dev/null @@ -1,201 +0,0 @@ -/* K=15 r=1/6 Viterbi decoder for x86 SSE - * Copyright Mar 2004, Phil Karn, 
KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; -typedef union { signed short s[16384]; __m64 v[4096];} metric_t; - -static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v615 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615_sse(void *p,int starting_state){ - struct v615 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<16384;i++) - vp->metrics1.s[i] = (SHRT_MIN+5000); - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ - return 0; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi615_sse(int len){ - struct v615 *vp; - - if(!Init){ - int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; - set_viterbi615_polynomial_sse(polys); - } - - if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){ - return NULL; - } - if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - init_viterbi615_sse(vp,0); - return vp; -} - -void set_viterbi615_polynomial_sse(int polys[6]){ - int state; - int i; - - for(state=0;state < 8192;state++){ - for(i=0;i<6;i++) - Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 
255 : 0; - } - Init++; -} - -/* Viterbi chainback */ -int chainback_viterbi615_sse( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v615 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - d = (decision_t *)vp->decisions; - endstate %= 16384; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 14; /* Look past tail */ - while(nbits-- != 0){ - int k; - - /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ - k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; - endstate = (k << 13) | (endstate >> 1); - data[nbits>>3] = endstate >> 6; - } - return 0; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi615_sse(void *p){ - struct v615 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){ - struct v615 *vp = p; - decision_t *d; - - if(p == NULL) - return -1; - d = (decision_t *)vp->dp; - while(nbits--){ - __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_pi16(syms[0]); - sym1v = _mm_set1_pi16(syms[1]); - sym2v = _mm_set1_pi16(syms[2]); - sym3v = _mm_set1_pi16(syms[3]); - sym4v = _mm_set1_pi16(syms[4]); - sym5v = _mm_set1_pi16(syms[5]); - syms += 6; - - for(i=0;i<2048;i++){ - __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. 
- * metric and m_metric (-metric) are in the range 0-1530 - */ - m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); - m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); - m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); - metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); - m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); - m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric); - m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric); - m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); - - /* Compare and select */ - survivor0 = _mm_min_pi16(m0,m1); - survivor1 = _mm_min_pi16(m2,m3); - decision0 = _mm_cmpeq_pi16(survivor0,m1); - decision1 = _mm_cmpeq_pi16(survivor1,m3); - - /* Pack decisions into 8 bits and store */ - d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); - } - /* See if we need to renormalize - * Max metric spread for this code with 0-255 branch metrics is 12750 - */ - if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ - int i,adjust; - __m64 adjustv; - union { __m64 v; signed short w[4]; } t; - - /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ - adjustv = vp->new_metrics->v[0]; - for(i=1;i<4096;i++) - adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); - - adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); - adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); - t.v = adjustv; - adjust = t.w[0] - SHRT_MIN; - adjustv = _mm_set1_pi16(adjust); - - for(i=0;i<4096;i++) - vp->new_metrics->v[i] = 
_mm_sub_pi16(vp->new_metrics->v[i],adjustv); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - _mm_empty(); - return 0; -} diff --git a/libfec/viterbi615_sse2.c b/libfec/viterbi615_sse2.c deleted file mode 100644 index 7f711e5..0000000 --- a/libfec/viterbi615_sse2.c +++ /dev/null @@ -1,204 +0,0 @@ -/* K=15 r=1/6 Viterbi decoder for x86 SSE2 - * Copyright Mar 2004, Phil Karn, KA9Q - * May be used under the terms of the GNU Lesser General Public License (LGPL) - */ -#include -#include -#include -#include -#include -#include "fec.h" - -typedef union { unsigned long w[512]; unsigned short s[1024];} decision_t; -typedef union { signed short s[16384]; __m128i v[2048];} metric_t; - -static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6]; -static int Init = 0; - -/* State info for instance of Viterbi decoder */ -struct v615 { - metric_t metrics1; /* path metric buffer 1 */ - metric_t metrics2; /* path metric buffer 2 */ - void *dp; /* Pointer to current decision */ - metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ - void *decisions; /* Beginning of decisions for block */ -}; - -/* Initialize Viterbi decoder for start of new frame */ -int init_viterbi615_sse2(void *p,int starting_state){ - struct v615 *vp = p; - int i; - - if(p == NULL) - return -1; - for(i=0;i<16384;i++) - vp->metrics1.s[i] = (SHRT_MIN+5000); - - vp->old_metrics = &vp->metrics1; - vp->new_metrics = &vp->metrics2; - vp->dp = vp->decisions; - vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ - return 0; -} - -/* Create a new instance of a Viterbi decoder */ -void *create_viterbi615_sse2(int len){ - void *p; - struct v615 *vp; - - if(!Init){ - int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; - set_viterbi615_polynomial_sse2(polys); - } - - /* Ordinary malloc() 
only returns 8-byte alignment, we need 16 */ - if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v615))) - return NULL; - - vp = (struct v615 *)p; - if((p = malloc((len+14)*sizeof(decision_t))) == NULL){ - free(vp); - return NULL; - } - vp->decisions = (decision_t *)p; - init_viterbi615_sse2(vp,0); - return vp; -} - -void set_viterbi615_polynomial_sse2(int polys[6]){ - int state; - int i; - - for(state=0;state < 8192;state++){ - for(i=0;i<6;i++) - Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; - } - Init++; -} - -/* Viterbi chainback */ -int chainback_viterbi615_sse2( - void *p, - unsigned char *data, /* Decoded output data */ - unsigned int nbits, /* Number of data bits */ - unsigned int endstate){ /* Terminal encoder state */ - struct v615 *vp = p; - decision_t *d = (decision_t *)vp->decisions; - - endstate %= 16384; - - /* The store into data[] only needs to be done every 8 bits. - * But this avoids a conditional branch, and the writes will - * combine in the cache anyway - */ - d += 14; /* Look past tail */ - while(nbits-- != 0){ - int k; - - k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; - endstate = (k << 13) | (endstate >> 1); - data[nbits>>3] = endstate >> 6; - } - return 0; -} - -/* Delete instance of a Viterbi decoder */ -void delete_viterbi615_sse2(void *p){ - struct v615 *vp = p; - - if(vp != NULL){ - free(vp->decisions); - free(vp); - } -} - - -int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){ - struct v615 *vp = p; - decision_t *d = (decision_t *)vp->dp; - - while(nbits--){ - __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; - void *tmp; - int i; - - /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ - sym0v = _mm_set1_epi16(syms[0]); - sym1v = _mm_set1_epi16(syms[1]); - sym2v = _mm_set1_epi16(syms[2]); - sym3v = _mm_set1_epi16(syms[3]); - sym4v = _mm_set1_epi16(syms[4]); - sym5v = _mm_set1_epi16(syms[5]); - syms += 6; - - /* SSE2 doesn't support saturated adds on 
unsigned shorts, so we have to use signed shorts */ - for(i=0;i<1024;i++){ - __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; - - /* Form branch metrics - * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, - * the XOR operations constitute conditional negation. - * metric and m_metric (-metric) are in the range 0-1530 - */ - m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v)); - m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v)); - m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v)); - metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2)); - m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric); - - /* Add branch metrics to path metrics */ - m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric); - m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric); - m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric); - m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric); - - /* Compare and select */ - survivor0 = _mm_min_epi16(m0,m1); - survivor1 = _mm_min_epi16(m2,m3); - decision0 = _mm_cmpeq_epi16(survivor0,m1); - decision1 = _mm_cmpeq_epi16(survivor1,m3); - - /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ - d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128()))); - - /* Store surviving metrics */ - vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1); - vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1); - } - /* See if we need to renormalize - * Max metric spread for this code with 0-90 branch metrics is 405 - */ - if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ - int i,adjust; - __m128i adjustv; - union { __m128i v; signed short w[8]; } t; - - /* Find smallest metric and set 
adjustv to bring it down to SHRT_MIN */ - adjustv = vp->new_metrics->v[0]; - for(i=1;i<2048;i++) - adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); - - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); - adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); - t.v = adjustv; - adjust = t.w[0] - SHRT_MIN; - adjustv = _mm_set1_epi16(adjust); - - /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX - * This is okay since it can't overflow anyway - */ - for(i=0;i<2048;i++) - vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); - } - d++; - /* Swap pointers to old and new metrics */ - tmp = vp->old_metrics; - vp->old_metrics = vp->new_metrics; - vp->new_metrics = tmp; - } - vp->dp = d; - return 0; -} - - diff --git a/libfec/vtest27.c b/libfec/vtest27.c deleted file mode 100644 index 7256483..0000000 --- a/libfec/vtest27.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Test viterbi decoder speeds */ -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_GETOPT_H -#include -#endif -#include "fec.h" - -#if HAVE_GETOPT_LONG -struct option Options[] = { - {"frame-length",1,NULL,'l'}, - {"frame-count",1,NULL,'n'}, - {"ebn0",1,NULL,'e'}, - {"gain",1,NULL,'g'}, - {"verbose",0,NULL,'v'}, - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {NULL}, -}; -#endif - -#define RATE (1./2.) 
-#define MAXBYTES 10000 - -double Gain = 32.0; -int Verbose = 0; - -int main(int argc,char *argv[]){ - int i,d,tr; - int sr=0,trials = 10000,errcnt,framebits=2048; - long long int tot_errs=0; - unsigned char bits[MAXBYTES]; - unsigned char data[MAXBYTES]; - unsigned char xordata[MAXBYTES]; - unsigned char symbols[8*2*(MAXBYTES+6)]; - void *vp; - extern char *optarg; - struct rusage start,finish; - double extime; - double gain,esn0,ebn0; - time_t t; - int badframes=0; - - time(&t); - srandom(t); - ebn0 = -100; -#if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ -#else - while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ -#endif - switch(d){ - case 'a': - Cpu_mode = ALTIVEC; - break; - case 'p': - Cpu_mode = PORT; - break; - case 'm': - Cpu_mode = MMX; - break; - case 's': - Cpu_mode = SSE; - break; - case 't': - Cpu_mode = SSE2; - break; - case 'l': - framebits = atoi(optarg); - break; - case 'n': - trials = atoi(optarg); - break; - case 'e': - ebn0 = atof(optarg); - break; - case 'g': - Gain = atof(optarg); - break; - case 'v': - Verbose++; - break; - } - } - if(framebits > 8*MAXBYTES){ - fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); - framebits = MAXBYTES*8; - } - if((vp = create_viterbi27(framebits)) == NULL){ - printf("create_viterbi27 failed\n"); - exit(1); - } - if(ebn0 != -100){ - esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ - /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing - * only half the noise power, and the sqrt() converts power to - * voltage. 
- */ - gain = 1./sqrt(0.5/pow(10.,esn0/10.)); - - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - - for(tr=0;tr 1 && errcnt != 0){ - printf("frame %d, %d errors: ",tr,errcnt); - for(i=0;i 1) - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - else if(Verbose == 0) - printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", - tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), - badframes,tr+1,(double)badframes/(tr+1)); - else - printf("\n"); - - } else { - /* Do time trials */ - memset(symbols,127,sizeof(symbols)); - printf("Starting time trials\n"); - getrusage(RUSAGE_SELF,&start); - for(tr=0;tr < trials;tr++){ - /* Initialize Viterbi decoder */ - init_viterbi27(vp,0); - - /* Decode block */ - update_viterbi27_blk(vp,symbols,framebits); - - /* Do Viterbi chainback */ - chainback_viterbi27(vp,data,framebits,0); - } - getrusage(RUSAGE_SELF,&finish); - extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); - printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, - framebits,extime); - printf("decoder speed: %g bits/s\n",trials*framebits/extime); - } - exit(0); -} diff --git a/libfec/vtest29.c b/libfec/vtest29.c deleted file mode 100644 index 8471b54..0000000 --- a/libfec/vtest29.c +++ /dev/null @@ -1,185 +0,0 @@ -/* Test viterbi decoder speeds */ -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_GETOPT_H -#include -#endif -#include "fec.h" - -#if HAVE_GETOPT_LONG -struct option Options[] = { - {"frame-length",1,NULL,'l'}, - {"frame-count",1,NULL,'n'}, - {"ebn0",1,NULL,'e'}, - {"gain",1,NULL,'g'}, - {"verbose",0,NULL,'v'}, - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {NULL}, -}; -#endif - -#define RATE (1./2.) 
-#define MAXBYTES 10000 - -double Gain = 32.0; -int Verbose = 0; - -int main(int argc,char *argv[]){ - int i,d,tr; - int sr=0,trials = 10000,errcnt,framebits=2048; - long long tot_errs=0; - unsigned char bits[MAXBYTES]; - unsigned char data[MAXBYTES]; - unsigned char xordata[MAXBYTES]; - unsigned char symbols[8*2*(MAXBYTES+8)]; - void *vp; - extern char *optarg; - struct rusage start,finish; - double extime; - double gain,esn0,ebn0; - time_t t; - int badframes=0; - - time(&t); - srandom(t); - ebn0 = -100; -#if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ -#else - while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ -#endif - switch(d){ - case 'a': - Cpu_mode = ALTIVEC; - break; - case 'p': - Cpu_mode = PORT; - break; - case 'm': - Cpu_mode = MMX; - break; - case 's': - Cpu_mode = SSE; - break; - case 't': - Cpu_mode = SSE2; - break; - case 'l': - framebits = atoi(optarg); - break; - case 'n': - trials = atoi(optarg); - break; - case 'e': - ebn0 = atof(optarg); - break; - case 'g': - Gain = atof(optarg); - break; - case 'v': - Verbose++; - break; - } - } - if(framebits > 8*MAXBYTES){ - fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); - framebits = MAXBYTES*8; - } - if((vp = create_viterbi29(framebits)) == NULL){ - printf("create_viterbi29 failed\n"); - exit(1); - } - if(ebn0 != -100){ - esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ - /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing - * only half the noise power, and the sqrt() converts power to - * voltage. 
- */ - gain = 1./sqrt(0.5/pow(10.,esn0/10.)); - - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - - for(tr=0;tr 1 && errcnt != 0){ - printf("frame %d, %d errors: ",tr,errcnt); - for(i=0;i 1) - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - else if(Verbose == 0) - printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", - tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), - badframes,tr+1,(double)badframes/(tr+1)); - else - printf("\n"); - } else { - /* Do time trials */ - memset(symbols,127,sizeof(symbols)); - printf("Starting time trials\n"); - getrusage(RUSAGE_SELF,&start); - for(tr=0;tr < trials;tr++){ - /* Initialize Viterbi decoder */ - init_viterbi29(vp,0); - - /* Decode block */ - update_viterbi29_blk(vp,symbols,framebits); - - /* Do Viterbi chainback */ - chainback_viterbi29(vp,data,framebits,0); - } - getrusage(RUSAGE_SELF,&finish); - extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); - printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, - framebits,extime); - printf("decoder speed: %g bits/s\n",trials*framebits/extime); - } - exit(0); -} - - diff --git a/libfec/vtest39.c b/libfec/vtest39.c deleted file mode 100644 index 76723b2..0000000 --- a/libfec/vtest39.c +++ /dev/null @@ -1,186 +0,0 @@ -/* Test viterbi decoder speeds */ -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_GETOPT_H -#include -#endif -#include "fec.h" - -#if HAVE_GETOPT_LONG -struct option Options[] = { - {"frame-length",1,NULL,'l'}, - {"frame-count",1,NULL,'n'}, - {"ebn0",1,NULL,'e'}, - {"gain",1,NULL,'g'}, - {"verbose",0,NULL,'v'}, - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {NULL}, -}; -#endif - -#define RATE (1./3.) 
-#define MAXBYTES 10000 - -double Gain = 32.0; -int Verbose = 0; - -int main(int argc,char *argv[]){ - int i,d,tr; - int sr=0,trials = 10000,errcnt,framebits=2048; - long long tot_errs=0; - unsigned char bits[MAXBYTES]; - unsigned char data[MAXBYTES]; - unsigned char xordata[MAXBYTES]; - unsigned char symbols[8*3*(MAXBYTES+8)]; - void *vp; - extern char *optarg; - struct rusage start,finish; - double extime; - double gain,esn0,ebn0; - time_t t; - int badframes=0; - - time(&t); - srandom(t); - ebn0 = -100; -#if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ -#else - while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ -#endif - switch(d){ - case 'a': - Cpu_mode = ALTIVEC; - break; - case 'p': - Cpu_mode = PORT; - break; - case 'm': - Cpu_mode = MMX; - break; - case 's': - Cpu_mode = SSE; - break; - case 't': - Cpu_mode = SSE2; - break; - case 'l': - framebits = atoi(optarg); - break; - case 'n': - trials = atoi(optarg); - break; - case 'e': - ebn0 = atof(optarg); - break; - case 'g': - Gain = atof(optarg); - break; - case 'v': - Verbose++; - break; - } - } - if(framebits > 8*MAXBYTES){ - fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); - framebits = MAXBYTES*8; - } - if((vp = create_viterbi39(framebits)) == NULL){ - printf("create_viterbi39 failed\n"); - exit(1); - } - if(ebn0 != -100){ - esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ - /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing - * only half the noise power, and the sqrt() converts power to - * voltage. 
- */ - gain = 1./sqrt(0.5/pow(10.,esn0/10.)); - - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - - for(tr=0;tr 1 && errcnt != 0){ - printf("frame %d, %d errors: ",tr,errcnt); - for(i=0;i 1) - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - else if(Verbose == 0) - printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", - tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), - badframes,tr+1,(double)badframes/(tr+1)); - else - printf("\n"); - } else { - /* Do time trials */ - memset(symbols,127,sizeof(symbols)); - printf("Starting time trials\n"); - getrusage(RUSAGE_SELF,&start); - for(tr=0;tr < trials;tr++){ - /* Initialize Viterbi decoder */ - init_viterbi39(vp,0); - - /* Decode block */ - update_viterbi39_blk(vp,symbols,framebits); - - /* Do Viterbi chainback */ - chainback_viterbi39(vp,data,framebits,0); - } - getrusage(RUSAGE_SELF,&finish); - extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); - printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, - framebits,extime); - printf("decoder speed: %g bits/s\n",trials*framebits/extime); - } - exit(0); -} - - diff --git a/libfec/vtest615.c b/libfec/vtest615.c deleted file mode 100644 index 4bd8c4f..0000000 --- a/libfec/vtest615.c +++ /dev/null @@ -1,191 +0,0 @@ -/* Test viterbi decoder speeds */ -#include "config.h" -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_GETOPT_H -#include -#endif -#include "fec.h" - -#if HAVE_GETOPT_LONG -struct option Options[] = { - {"frame-length",1,NULL,'l'}, - {"frame-count",1,NULL,'n'}, - {"ebn0",1,NULL,'e'}, - {"gain",1,NULL,'g'}, - {"verbose",0,NULL,'v'}, - {"force-altivec",0,NULL,'a'}, - {"force-port",0,NULL,'p'}, - {"force-mmx",0,NULL,'m'}, - {"force-sse",0,NULL,'s'}, - {"force-sse2",0,NULL,'t'}, - {NULL}, -}; -#endif - -#define RATE (1./6.) 
-#define MAXBYTES 10000 -#define OFFSET (127.5) -#define CLIP 255 - -double Gain = 24.0; -int Verbose = 0; - -int main(int argc,char *argv[]){ - int i,d,tr; - int sr=0,trials = 10,errcnt,framebits=2048; - int tot_errs=0; - unsigned char bits[MAXBYTES]; - unsigned char data[MAXBYTES]; - unsigned char xordata[MAXBYTES]; - unsigned char symbols[8*6*(MAXBYTES+14)]; - void *vp; - extern char *optarg; - struct rusage start,finish; - double extime; - double gain,esn0,ebn0; - time_t t; - int badframes=0; - - time(&t); - srandom(t); - ebn0 = -100; -#if HAVE_GETOPT_LONG - while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ -#else - while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ -#endif - switch(d){ - case 'a': - Cpu_mode = ALTIVEC; - break; - case 'p': - Cpu_mode = PORT; - break; - case 'm': - Cpu_mode = MMX; - break; - case 's': - Cpu_mode = SSE; - break; - case 't': - Cpu_mode = SSE2; - break; - case 'l': - framebits = atoi(optarg); - break; - case 'n': - trials = atoi(optarg); - break; - case 'e': - ebn0 = atof(optarg); - break; - case 'g': - Gain = atof(optarg); - break; - case 'v': - Verbose++; - break; - } - } - if(framebits > 8*MAXBYTES){ - fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); - framebits = MAXBYTES*8; - } - if((vp = create_viterbi615(framebits)) == NULL){ - printf("create_viterbi615 failed\n"); - exit(1); - } - if(ebn0 != -100){ - esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ - /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing - * only half the noise power, and the sqrt() converts power to - * voltage. 
- */ - gain = 1./sqrt(0.5/pow(10.,esn0/10.)); - - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - - for(tr=0;tr 1 && errcnt != 0){ - printf("frame %d, %d errors: ",tr,errcnt); - for(i=0;i 1) - printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); - else if(Verbose == 0) - printf("BER %d/%d (%.3g) FER %d/%d (%.3g)\n", - tot_errs,framebits*(tr+1),tot_errs/((double)framebits*(tr+1)), - badframes,(tr+1),(double)badframes/(tr+1)); - else - printf("\n"); - } else { - /* Do time trials */ - memset(symbols,127,sizeof(symbols)); - printf("Starting time trials\n"); - getrusage(RUSAGE_SELF,&start); - for(tr=0;tr < trials;tr++){ - /* Initialize Viterbi decoder */ - init_viterbi615(vp,0); - - /* Decode block */ - update_viterbi615_blk(vp,symbols,framebits+14); - - /* Do Viterbi chainback */ - chainback_viterbi615(vp,data,framebits,0); - } - getrusage(RUSAGE_SELF,&finish); - extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); - printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, - framebits,extime); - printf("decoder speed: %g bits/s\n",trials*framebits/extime); - } - exit(0); -}