diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4be403f..34ce3db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,7 +131,6 @@ find_package(Volk REQUIRED)
 find_package(OggVorbis REQUIRED)
 find_package(PNG REQUIRED)
 find_package(png++ REQUIRED)
-find_package(Fec REQUIRED)
 
 ########################################################################
 # Include or not into the module blocks for debugging
@@ -150,6 +149,48 @@ if(${INCLUDE_DEBUG_BLOCKS})
     endif()
 endif()
 
+########################################################################
+# Use the system libfec when find_package(Fec) locates it; otherwise
+# build the bundled copy in ./libfec through ExternalProject.
+########################################################################
+find_package(Fec)
+if(NOT FEC_FOUND)
+    message(WARNING "libfec is not installed. The internal libfec will be automatically build and install.")
+    include(ExternalProject)
+    ExternalProject_Add(FEC_EXTERNAL
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libfec
+        BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libfec
+        CMAKE_ARGS "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+                   "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+                   "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+        INSTALL_COMMAND ""
+    )
+
+    ExternalProject_Get_Property(FEC_EXTERNAL binary_dir)
+    add_library(fec SHARED IMPORTED)
+
+    # INSTALL_COMMAND is empty, so the library only ever exists in the
+    # ExternalProject build tree. Only binary_dir is queried above; the
+    # previously used install_dir was never set and expanded to nothing.
+    set(FEC_LIBRARIES "${binary_dir}/${CMAKE_SHARED_LIBRARY_PREFIX}fec${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set_property(TARGET fec PROPERTY IMPORTED_LOCATION "${FEC_LIBRARIES}")
+
+    add_dependencies(fec FEC_EXTERNAL)
+    set(FEC_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libfec")
+
+    # Install the header and the library in the standard places
+    install(FILES
+        "${FEC_INCLUDE_DIRS}/fec.h"
+        DESTINATION "include"
+    )
+    install(FILES
+        ${FEC_LIBRARIES}
+        DESTINATION lib${LIB_SUFFIX}
+    )
+else()
+    add_library(fec INTERFACE)
+endif()
+
 # Search for GNU Radio and its components and versions.
Add any
 # components required to the list of GR_REQUIRED_COMPONENTS (in all
 # caps such as FILTER or FFT) and change the version to the minimum
diff --git a/apps/flowgraphs/debug_afsk_transceiver_osmocom.py b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py
new file mode 100755
index 0000000..a9e0892
--- /dev/null
+++ b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+##################################################
+# GNU Radio Python Flow Graph
+# Title: Debug Afsk Transceiver Osmocom
+# Generated: Mon Jun 13 20:30:12 2016
+##################################################
+
+if __name__ == '__main__':
+    import ctypes
+    import sys
+    if sys.platform.startswith('linux'):
+        try:
+            x11 = ctypes.cdll.LoadLibrary('libX11.so')
+            x11.XInitThreads()
+        except:  # best-effort: any failure here only prints the warning below
+            print "Warning: failed to XInitThreads()"
+
+from PyQt4 import Qt
+from gnuradio import analog
+from gnuradio import audio
+from gnuradio import blocks
+from gnuradio import eng_notation
+from gnuradio import filter
+from gnuradio import gr
+from gnuradio import qtgui
+from gnuradio.eng_option import eng_option
+from gnuradio.filter import firdes
+from gnuradio.qtgui import Range, RangeWidget
+from optparse import OptionParser
+import math
+import numpy
+import satnogs
+import sip
+import sys
+
+
+class debug_afsk_transceiver_osmocom(gr.top_block, Qt.QWidget):
+
+    def __init__(self):
+        gr.top_block.__init__(self, "Debug Afsk Transceiver Osmocom")
+        Qt.QWidget.__init__(self)
+        self.setWindowTitle("Debug Afsk Transceiver Osmocom")
+        try:
+            self.setWindowIcon(Qt.QIcon.fromTheme('gnuradio-grc'))
+        except:  # a failure to set the window icon is deliberately ignored
+            pass
+        self.top_scroll_layout = Qt.QVBoxLayout()
+        self.setLayout(self.top_scroll_layout)
+        self.top_scroll = Qt.QScrollArea()
+        self.top_scroll.setFrameStyle(Qt.QFrame.NoFrame)
+        self.top_scroll_layout.addWidget(self.top_scroll)
+        self.top_scroll.setWidgetResizable(True)
+        self.top_widget = Qt.QWidget()
+        self.top_scroll.setWidget(self.top_widget)
+        self.top_layout = Qt.QVBoxLayout(self.top_widget)
+        self.top_grid_layout = Qt.QGridLayout()
+        self.top_layout.addLayout(self.top_grid_layout)
+
+        self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom")
+        self.restoreGeometry(self.settings.value("geometry").toByteArray())
+
+        ##################################################
+        # Variables
+        ##################################################
+        self.samples_per_symbol_tx = samples_per_symbol_tx = 4
+        self.sq_wave = sq_wave = (1.0, ) * samples_per_symbol_tx
+        self.gaussian_taps = gaussian_taps = filter.firdes.gaussian(1.0, samples_per_symbol_tx, 1.0, 4*samples_per_symbol_tx)
+        self.deviation = deviation = 800
+        self.baud_rate = baud_rate = 1200
+        self.tx_frequency = tx_frequency = 145.835e6  # stored only; no block below is constructed from it
+        self.samp_rate_tx = samp_rate_tx = 48e3
+        self.modulation_index = modulation_index = deviation / (baud_rate / 2.0)
+        self.interp_taps = interp_taps = numpy.convolve(numpy.array(gaussian_taps), numpy.array(sq_wave))
+        self.atten = atten = 0.1
+
+        ##################################################
+        # Blocks
+        ##################################################
+        self._atten_range = Range(0, 0.9, 0.01, 0.1, 200)
+        self._atten_win = RangeWidget(self._atten_range, self.set_atten, "Attenuation", "counter_slider", float)
+        self.top_layout.addWidget(self._atten_win)
+        self.satnogs_upsat_fsk_frame_encoder_0 = satnogs.upsat_fsk_frame_encoder([0x33]*8, [0x7A, 0x0E], False, False, False, True, True, "ABCD", 0, "UPSAT", 0, 1024)
+        self.satnogs_udp_msg_source_0 = satnogs.udp_msg_source("", 16886, 1500)
+        self.satnogs_debug_msg_source_0 = satnogs.debug_msg_source("HELLO"*4, 1, True)
+        self.rational_resampler_xxx_0 = filter.rational_resampler_ccc(
+                interpolation=10,
+                decimation=1,
+                taps=None,
+                fractional_bw=None,
+        )
+        self.qtgui_time_sink_x_0_0_0 = qtgui.time_sink_c(
+            1024, #size
+            samp_rate_tx, #samp_rate
+            "", #name
+            1 #number of inputs
+        )
+        self.qtgui_time_sink_x_0_0_0.set_update_time(0.10)
+        self.qtgui_time_sink_x_0_0_0.set_y_axis(-1, 1)
+
+        self.qtgui_time_sink_x_0_0_0.set_y_label("Amplitude", "")
+
+        self.qtgui_time_sink_x_0_0_0.enable_tags(-1, True)
+        self.qtgui_time_sink_x_0_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "")
+        self.qtgui_time_sink_x_0_0_0.enable_autoscale(False)
+        self.qtgui_time_sink_x_0_0_0.enable_grid(False)
+        self.qtgui_time_sink_x_0_0_0.enable_control_panel(True)
+
+        if not True:  # constant condition: the legend is never disabled
+          self.qtgui_time_sink_x_0_0_0.disable_legend()
+
+        labels = ["", "", "", "", "",
+                  "", "", "", "", ""]
+        widths = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        colors = ["blue", "red", "green", "black", "cyan",
+                  "magenta", "yellow", "dark red", "dark green", "blue"]
+        styles = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        markers = [2, -1, -1, -1, -1,
+                   -1, -1, -1, -1, -1]
+        alphas = [1.0, 1.0, 1.0, 1.0, 1.0,
+                  1.0, 1.0, 1.0, 1.0, 1.0]
+
+        for i in xrange(2*1):  # two lines for the single complex input: even index = Re, odd index = Im
+            if len(labels[i]) == 0:
+                if(i % 2 == 0):
+                    self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Re{{Data {0}}}".format(i/2))
+                else:
+                    self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Im{{Data {0}}}".format(i/2))
+            else:
+                self.qtgui_time_sink_x_0_0_0.set_line_label(i, labels[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_width(i, widths[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_color(i, colors[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_style(i, styles[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_marker(i, markers[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_alpha(i, alphas[i])
+
+        self._qtgui_time_sink_x_0_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0_0.pyqwidget(), Qt.QWidget)
+        self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_0_win)
+        self.qtgui_time_sink_x_0_0 = qtgui.time_sink_f(
+            1024, #size
+            samp_rate_tx, #samp_rate
+            "", #name
+            1 #number of inputs
+        )
+        self.qtgui_time_sink_x_0_0.set_update_time(0.10)
+        self.qtgui_time_sink_x_0_0.set_y_axis(-1, 1)
+
+        self.qtgui_time_sink_x_0_0.set_y_label("Amplitude", "")
+
+        self.qtgui_time_sink_x_0_0.enable_tags(-1, True)
+        self.qtgui_time_sink_x_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "")
+        self.qtgui_time_sink_x_0_0.enable_autoscale(False)
+        self.qtgui_time_sink_x_0_0.enable_grid(False)
+        self.qtgui_time_sink_x_0_0.enable_control_panel(True)
+
+        if not True:  # constant condition: the legend is never disabled
+          self.qtgui_time_sink_x_0_0.disable_legend()
+
+        labels = ["", "", "", "", "",
+                  "", "", "", "", ""]
+        widths = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        colors = ["blue", "red", "green", "black", "cyan",
+                  "magenta", "yellow", "dark red", "dark green", "blue"]
+        styles = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        markers = [2, -1, -1, -1, -1,
+                   -1, -1, -1, -1, -1]
+        alphas = [1.0, 1.0, 1.0, 1.0, 1.0,
+                  1.0, 1.0, 1.0, 1.0, 1.0]
+
+        for i in xrange(1):
+            if len(labels[i]) == 0:
+                self.qtgui_time_sink_x_0_0.set_line_label(i, "Data {0}".format(i))
+            else:
+                self.qtgui_time_sink_x_0_0.set_line_label(i, labels[i])
+            self.qtgui_time_sink_x_0_0.set_line_width(i, widths[i])
+            self.qtgui_time_sink_x_0_0.set_line_color(i, colors[i])
+            self.qtgui_time_sink_x_0_0.set_line_style(i, styles[i])
+            self.qtgui_time_sink_x_0_0.set_line_marker(i, markers[i])
+            self.qtgui_time_sink_x_0_0.set_line_alpha(i, alphas[i])
+
+        self._qtgui_time_sink_x_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0.pyqwidget(), Qt.QWidget)
+        self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_win)
+        self.interp_fir_filter_xxx_0 = filter.interp_fir_filter_fff(samples_per_symbol_tx, (interp_taps))
+        self.interp_fir_filter_xxx_0.declare_sample_delay(0)
+        self.blocks_vco_f_0 = blocks.vco_f(48e3, -48e3, 1.0)
+        self.blocks_multiply_const_vxx_0 = blocks.multiply_const_vff((atten, ))
+        self.audio_sink_0_0 = audio.sink(48000, "", True)
+        self.analog_quadrature_demod_cf_0 = analog.quadrature_demod_cf(48e3/(2*math.pi*deviation/8.0))  # NOTE(review): 48e3 is a literal here, not samp_rate_tx
+        self.analog_frequency_modulator_fc_0 = analog.frequency_modulator_fc((math.pi*modulation_index) / samples_per_symbol_tx)
+
+        ##################################################
+        # Connections
+        ##################################################
+        self.msg_connect((self.satnogs_debug_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu'))
+        self.msg_connect((self.satnogs_udp_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu'))
+        self.connect((self.analog_frequency_modulator_fc_0, 0), (self.qtgui_time_sink_x_0_0_0, 0))
+        self.connect((self.analog_frequency_modulator_fc_0, 0), (self.rational_resampler_xxx_0, 0))
+        self.connect((self.analog_quadrature_demod_cf_0, 0), (self.blocks_multiply_const_vxx_0, 0))
+        self.connect((self.blocks_multiply_const_vxx_0, 0), (self.audio_sink_0_0, 0))
+        self.connect((self.blocks_vco_f_0, 0), (self.qtgui_time_sink_x_0_0, 0))
+        self.connect((self.interp_fir_filter_xxx_0, 0), (self.analog_frequency_modulator_fc_0, 0))
+        self.connect((self.interp_fir_filter_xxx_0, 0), (self.blocks_vco_f_0, 0))
+        self.connect((self.rational_resampler_xxx_0, 0), (self.analog_quadrature_demod_cf_0, 0))
+        self.connect((self.satnogs_upsat_fsk_frame_encoder_0, 0), (self.interp_fir_filter_xxx_0, 0))
+
+    def closeEvent(self, event):
+        self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom")
+        self.settings.setValue("geometry", self.saveGeometry())  # persists the geometry that __init__ restores
+        event.accept()
+
+
+    def get_samples_per_symbol_tx(self):
+        return self.samples_per_symbol_tx
+
+    def set_samples_per_symbol_tx(self, samples_per_symbol_tx):
+        self.samples_per_symbol_tx = samples_per_symbol_tx
+        self.set_gaussian_taps(filter.firdes.gaussian(1.0, self.samples_per_symbol_tx, 1.0, 4*self.samples_per_symbol_tx))
+        self.set_sq_wave((1.0, ) * self.samples_per_symbol_tx)
+        self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx)
+
+    def get_sq_wave(self):
+        return self.sq_wave
+
+    def set_sq_wave(self, sq_wave):
+        self.sq_wave = sq_wave
+        self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), numpy.array(self.sq_wave)))
+
+    def get_gaussian_taps(self):
+        return self.gaussian_taps
+
+    def set_gaussian_taps(self, gaussian_taps):
+        self.gaussian_taps = gaussian_taps
+        self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), numpy.array(self.sq_wave)))
+
+    def get_deviation(self):
+        return self.deviation
+
+    def set_deviation(self, deviation):
+        self.deviation = deviation
+        self.set_modulation_index(self.deviation / (self.baud_rate / 2.0))
+        self.analog_quadrature_demod_cf_0.set_gain(48e3/(2*math.pi*self.deviation/8.0))
+
+    def get_baud_rate(self):
+        return self.baud_rate
+
+    def set_baud_rate(self, baud_rate):
+        self.baud_rate = baud_rate
+        self.set_modulation_index(self.deviation / (self.baud_rate / 2.0))
+
+    def get_tx_frequency(self):
+        return self.tx_frequency
+
+    def set_tx_frequency(self, tx_frequency):
+        self.tx_frequency = tx_frequency  # only stores the value; no block is updated
+
+    def get_samp_rate_tx(self):
+        return self.samp_rate_tx
+
+    def set_samp_rate_tx(self, samp_rate_tx):
+        self.samp_rate_tx = samp_rate_tx
+        self.qtgui_time_sink_x_0_0_0.set_samp_rate(self.samp_rate_tx)
+        self.qtgui_time_sink_x_0_0.set_samp_rate(self.samp_rate_tx)
+
+    def get_modulation_index(self):
+        return self.modulation_index
+
+    def set_modulation_index(self, modulation_index):
+        self.modulation_index = modulation_index
+        self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx)
+
+    def get_interp_taps(self):
+        return self.interp_taps
+
+    def set_interp_taps(self, interp_taps):
+        self.interp_taps = interp_taps
+        self.interp_fir_filter_xxx_0.set_taps((self.interp_taps))
+
+    def get_atten(self):
+        return self.atten
+
+    def set_atten(self, atten):
+        self.atten = atten
+        self.blocks_multiply_const_vxx_0.set_k((self.atten, ))
+
+
+def main(top_block_cls=debug_afsk_transceiver_osmocom, options=None):
+
+    from distutils.version import StrictVersion
+    if StrictVersion(Qt.qVersion()) >= StrictVersion("4.5.0"):
+        style = gr.prefs().get_string('qtgui', 'style', 'raster')
+        Qt.QApplication.setGraphicsSystem(style)
+    qapp = Qt.QApplication(sys.argv)
+
+    tb = top_block_cls()
+    tb.start()
+    tb.show()
+
+    def quitting():  # connected to aboutToQuit below: stops the flowgraph and waits for it
+        tb.stop()
+        tb.wait()
+    qapp.connect(qapp, Qt.SIGNAL("aboutToQuit()"), quitting)
+    qapp.exec_()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/flowgraphs/device_args_handler.py b/apps/flowgraphs/device_args_handler.py
new file mode 100644
index 0000000..346a07f
--- /dev/null
+++ b/apps/flowgraphs/device_args_handler.py
@@ -0,0 +1,5 @@
+# This module is meant to be imported into your flowgraph.
+
+def append_dev_args(device, dev_args):
+    if(len(dev_args) == 0):
+        return 0  # NOTE(review): non-empty dev_args falls through and returns None; 'device' is never used
diff --git a/apps/flowgraphs/satellites/mpla.ogg b/apps/flowgraphs/satellites/mpla.ogg
new file mode 100644
index 0000000..67b83eb
Binary files /dev/null and b/apps/flowgraphs/satellites/mpla.ogg differ
diff --git a/cmake/Modules/FindFec.cmake b/cmake/Modules/FindFec.cmake
index d1197a8..76b4e42 100644
--- a/cmake/Modules/FindFec.cmake
+++ b/cmake/Modules/FindFec.cmake
@@ -22,4 +22,4 @@ FIND_LIBRARY(
     )
 
 INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS)
\ No newline at end of file
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS)
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 8bf9a01..e2a6153 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -81,6 +81,9 @@ if(NOT satnogs_sources)
 endif(NOT satnogs_sources)
 
 add_library(gnuradio-satnogs SHARED ${satnogs_sources})
+
+add_dependencies(gnuradio-satnogs fec)  # build ordering only; this call does not link libfec
+
 target_link_libraries(gnuradio-satnogs
   ${Boost_LIBRARIES}
   ${GNURADIO_ALL_LIBRARIES}
diff --git a/libfec/CMakeLists.txt b/libfec/CMakeLists.txt
new file mode 100644
index 0000000..684a6d0
--- /dev/null
+++ b/libfec/CMakeLists.txt
@@ -0,0 +1,323 @@
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.8)
+project(libfec ASM C)
+
+option(BUILD_32BIT_ON_64BIT "Build a 32-bit library on a 64-bit system" OFF)
+
+# Select the release build type by default to get optimization flags
+if(NOT CMAKE_BUILD_TYPE)
+   set(CMAKE_BUILD_TYPE "Release")
+   message(STATUS "Build type not specified: defaulting to release.")
+endif(NOT CMAKE_BUILD_TYPE)
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
+
+list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)  # project-relative, so it also resolves when libfec is not the top-level project
+
+if(NOT LIB_INSTALL_DIR)
+   set(LIB_INSTALL_DIR lib)
+endif()
+
+
+########################################################################
+# Version information
+########################################################################
+set(VERSION_INFO_MAJOR 3)
+set(VERSION_INFO_MINOR 0)
+set(VERSION_INFO_PATCH 0)
+
+if(NOT DEFINED VERSION_INFO_EXTRA)
+    set(VERSION_INFO_EXTRA "git")
+endif()
+include(Version)
+
+if(NOT DEFINED VERSION)
+    #set(VERSION "\"${VERSION_INFO_MAJOR}.${VERSION_INFO_MINOR}.${VERSION_INFO_PATCH}\"")
+    set(VERSION "\"${VERSION_INFO}\"")
+endif()
+
+
+########################################################################
+# Compiler specific setup
+########################################################################
+if(BUILD_32BIT_ON_64BIT)
+    set(CMAKE_SYSTEM_PROCESSOR "i386")
+    set(CMAKE_SIZEOF_VOID_P 4)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")  # append: keep flags handed in by the caller (e.g. -DCMAKE_C_FLAGS=...)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")  # append for the same reason
+    add_definitions(-m32)
+endif()
+
+if((CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|x86|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 4))
+    set(TARGET_ARCH "x86")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 8))
+    set(TARGET_ARCH "x64")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "i386") AND (CMAKE_SIZEOF_VOID_P EQUAL 8) AND (APPLE))
+    # Mac is weird like that.
+    set(TARGET_ARCH "x64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")  # was "^arm*", which matched every name starting with "ar"
+    set(TARGET_ARCH "ARM")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
+    set(TARGET_ARCH "ppc64" "ppc64le")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+    set(TARGET_ARCH "ppc64" "ppc")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
+    set(TARGET_ARCH "ppc")
+endif()
+
+
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_CLANGCC)  # NOTE(review): CMAKE_COMPILER_IS_CLANGCC is not set in this file - confirm it is defined elsewhere
+    add_definitions(-Wall)
+    add_definitions(-Wno-unused)
+
+    if(TARGET_ARCH MATCHES "x64")
+        add_definitions(-fPIC)
+        add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "x86")
+        add_definitions(-mmmx)
+        add_definitions(-msse)
+        add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+        add_definitions(-fno-common)
+        add_definitions(-faltivec)
+    endif()
+
+endif()
+
+########################################################################
+# Find build dependencies
+########################################################################
+
+# libm
+find_library(M_LIB m REQUIRED)
+
+
+########################################################################
+# config.h
+########################################################################
+
+#add_definitions(-DHAVE_CONFIG_H)
+
+# Checks for includes
+include(CheckIncludeFile)
+check_include_file("getopt.h" HAVE_GETOPT_H)
+check_include_file("stdio.h" HAVE_STDIO_H)
+check_include_file("stdlib.h" HAVE_STDLIB_H)
+check_include_file("memory.h" HAVE_MEMORY_H)
+check_include_file("string.h" HAVE_STRING_H)
+
+# Checks for functions
+include(CheckFunctionExists)
+check_function_exists("getopt_long" HAVE_GETOPT_LONG)
+check_function_exists("memset" HAVE_MEMSET)
+check_function_exists("memmove" HAVE_MEMMOVE)
+
+
+########################################################################
+# Select the source files for the detected target architecture
+########################################################################
+
+if(TARGET_ARCH MATCHES "x64")
+  list(APPEND libfec_sources
+    dotprod_port.c
+    peakval_port.c
+    sumsq.c
+    sumsq_port.c
+    cpu_mode_x86_64.c
+    ##asm
+    #sse2bfly27-64.s
+    #sse2bfly29-64.s
+  )
+
+elseif(TARGET_ARCH MATCHES "x86")
+  list(APPEND libfec_sources
+    viterbi27_mmx.c
+    viterbi27_sse.c
+    viterbi27_sse2.c
+    viterbi29_mmx.c
+    viterbi29_sse.c
+    viterbi29_sse2.c
+    viterbi39_sse2.c
+    viterbi39_sse.c
+    viterbi39_mmx.c
+    viterbi615_mmx.c
+    viterbi615_sse.c
+    viterbi615_sse2.c
+    dotprod_mmx.c
+    dotprod_sse2.c
+    #peakval_mmx.c
+    #peakval_sse.c
+    #peakval_sse2.c
+    sumsq.c
+    sumsq_port.c
+    sumsq_sse2.c
+    sumsq_mmx.c
+    cpu_mode_x86.c
+    #asm
+    cpu_features.s
+    dotprod_mmx_assist.s
+    dotprod_sse2_assist.s
+    mmxbfly27.s
+    mmxbfly29.s
+    peak_mmx_assist.s
+    peak_sse2_assist.s
+    peak_sse_assist.s
+    peakval_mmx_assist.s
+    peakval_sse2_assist.s
+    peakval_sse_assist.s
+    sse2bfly27.s
+    sse2bfly29.s
+    ssebfly27.s
+    ssebfly29.s
+    sumsq_mmx_assist.s
+    sumsq_sse2_assist.s
+  )
+
+elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+  list(APPEND libfec_sources
+    viterbi27_av.c
+    viterbi29_av.c
+    viterbi39_av.c
+    viterbi615_av.c
+    encode_rs_av.c
+    dotprod_av.c
+    sumsq_av.c
+    peakval_av.c
+    cpu_mode_ppc.c
+  )
+else()
+  list(APPEND libfec_sources
+    cpu_mode_generic.c
+  )
+
+endif()
+
+
+list(APPEND libfec_sources
+    fec.c
+    sim.c
+    viterbi27.c
+    viterbi27_port.c
+    viterbi29.c
+    viterbi29_port.c
+    viterbi39.c
+    viterbi39_port.c
+    viterbi615.c
+    viterbi615_port.c
+    encode_rs_char.c
+    encode_rs_int.c
+    encode_rs_8.c
+    decode_rs_char.c
+    decode_rs_int.c
+    decode_rs_8.c
+    init_rs_char.c
+    init_rs_int.c
+    encode_rs_ccsds.c
+    decode_rs_ccsds.c
+    dotprod.c
+    dotprod_port.c
+    peakval.c
+    peakval_port.c
+    sumsq.c
+    sumsq_port.c
+    ccsds_tab.c
+    ccsds_tal.c
+)
+
+
+################################################################################
+# Generate pkg-config file
+################################################################################
+foreach(inc ${LIBFEC_INCLUDE_DIR})
+    list(APPEND LIBFEC_PC_CFLAGS "-I${inc}")
+endforeach()
+
+foreach(lib ${LIBFEC_LIBRARY_DIRS})
+    list(APPEND LIBFEC_PC_PRIV_LIBS "-L${lib}")
+endforeach()
+
+set(LIBFEC_PC_PREFIX ${CMAKE_INSTALL_PREFIX})
+set(LIBFEC_PC_EXEC_PREFIX \${prefix})
+set(LIBFEC_PC_LIBDIR \${exec_prefix}/${LIB_INSTALL_DIR})
+set(LIBFEC_PC_INCLUDEDIR \${prefix}/include)
+set(LIBFEC_PC_VERSION ${VERSION})
+set(LIBFEC_PC_LIBS "-lfec")
+
+# Use space-delimiter in the .pc file, rather than CMake's semicolon separator
+string(REPLACE ";" " " LIBFEC_PC_CFLAGS "${LIBFEC_PC_CFLAGS}")
+string(REPLACE ";" " " LIBFEC_PC_LIBS "${LIBFEC_PC_LIBS}")
+
+# Unset these to avoid hard-coded paths in a cross-environment
+if(CMAKE_CROSSCOMPILING)
+    unset(LIBFEC_PC_CFLAGS)
+    unset(LIBFEC_PC_LIBS)
+endif()
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libfec.pc.in
+    ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    @ONLY
+)
+
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    DESTINATION ${LIB_INSTALL_DIR}/pkgconfig/
+)
+
+
+########################################################################
+# Setup libraries
+########################################################################
+
+# generate ccsds_tab.c by running the gen_ccsds tool built below (the target name in COMMAND resolves to its executable)
+add_executable(gen_ccsds gen_ccsds.c init_rs_char.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c
+    COMMAND gen_ccsds > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c
+    DEPENDS gen_ccsds
+)
+
+# generate ccsds_tal.c by running the gen_ccsds_tal tool built below
+add_executable(gen_ccsds_tal gen_ccsds_tal.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c
+    COMMAND gen_ccsds_tal > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c
+    DEPENDS gen_ccsds_tal
+)
+
+# libfec shared library; OUTPUT_NAME makes the produced file "fec" (libfec.so on Linux)
+add_library(libfec_shared SHARED ${libfec_sources})
+set_target_properties(libfec_shared PROPERTIES OUTPUT_NAME fec)
+target_link_libraries(libfec_shared ${M_LIB})
+
+
+install(TARGETS libfec_shared
+        DESTINATION ${LIB_INSTALL_DIR})
+install(FILES "${PROJECT_SOURCE_DIR}/fec.h"
+        DESTINATION include)
+
+
+########################################################################
+# Create uninstall target
+########################################################################
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+    IMMEDIATE @ONLY)
+
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
+
+
+########################################################################
+# Print Summary
+########################################################################
+message(STATUS "")
+message(STATUS "##########################################################")
+message(STATUS "## Building for version: ${VERSION}")
+message(STATUS "## Target Architecture: ${TARGET_ARCH}")
+message(STATUS "## Using install prefix: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "##########################################################")
+message(STATUS "")
+
diff --git a/libfec/INSTALL b/libfec/INSTALL
new file mode 100644
index 0000000..7c003a2
--- /dev/null
+++ b/libfec/INSTALL
@@ -0,0 +1,51 @@
+INSTALLATION INSTRUCTIONS
+
+CMake-based build:
+
+Works on most platforms. Do
+
+mkdir build
+cd build
+cmake ..
+make
+
+
+If that fails, try the older automake-based build:
+
+./bootstrap
+./configure
+make
+make test (optional)
+make install (as root)
+
+By default, "make install" puts the libfec libraries in
+/usr/local/lib, the include files in /usr/local/include, and the
+manual page in /usr/local/man.
+
+You may have an old version of the GNU assembler that cannot handle
+the relatively new SSE2 mnemonics. Update your version of the GNU
+"binutils" package.
+
+You may obtain the latest binutils package through your normal
+distribution channels or from:
+
+http://sources.redhat.com/binutils/
+
+TESTING THE FEC LIBRARY
+
+After running the ./configure script, optional tests can be built and
+run as follows:
+
+make test
+
+"make test" tests each routine, using the SIMD versions as
+appropriate, verifying correct operation and estimating Viterbi
+decoding speeds. These tests should always succeed unless something is
+broken.
+ +28 Mar 2004 +Phil Karn, karn@ka9q.net + +3 Jan 2014 +Matthias P. Braendli, matthias@mpb.li + diff --git a/libfec/LICENSE b/libfec/LICENSE new file mode 100644 index 0000000..5a883d3 --- /dev/null +++ b/libfec/LICENSE @@ -0,0 +1,502 @@ +GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +(This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.) + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. 
+ + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. 
We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. 
+ + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. 
+ + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. 
+ + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. 
+ + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. 
If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. 
+ + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. 
However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. 
+ +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + {description} + Copyright (C) {year} {fullname} + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + {signature of Ty Coon}, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/libfec/README b/libfec/README new file mode 100644 index 0000000..68d043e --- /dev/null +++ b/libfec/README @@ -0,0 +1,125 @@ +COPYRIGHT + +This package is copyright 2006 by Phil Karn, KA9Q. It may be used +under the terms of the GNU Lesser General Public License (LGPL). See +the file "lesser.txt" in this package for license details. + +It has been modified by Matthias P. Braendli, HB9EGM, so that it +compiles for x86_64 and for arm. + +For installation instructions, please see INSTALL + +INTRODUCTION + +This package provides a set of functions that implement several +popular forward error correction (FEC) algorithms and several low-level routines +useful in modems implemented with digital signal processing (DSP). + +The following routines are provided: + +1. Viterbi decoders for the following convolutional codes: + +r=1/2 k=7 ("Voyager" code, now a widely used industry standard) +r=1/2 k=9 (Used on the IS-95 CDMA forward link) +r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions) + +2. Reed-Solomon encoders and decoders for any user-specified code. + +3. Optimized encoder and decoder for the CCSDS-standard (255,223) +Reed-Solomon code, with and without the CCSDS-standard "dual basis" +symbol representation. 
+ +4. Compute dot product between a 16-bit buffer and a set of 16-bit +coefficients. This is the basic DSP primitive for digital filtering +and correlation. + +5. Compute sum of squares of a buffer of 16-bit signed integers. This is +useful in DSP for finding the total energy in a signal. + +6. Find peak value in a buffer of 16-bit signed integers, useful for +scaling a signal to prevent overflow. + +SIMD SUPPORT + +This package automatically makes use of various SIMD (Single +Instruction stream, Multiple Data stream) instruction sets, when +available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and +Altivec on the PowerPC G4 and G5 used by Power Macintoshes. + +"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine", +and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the +IA-32. + +Many of the SIMD versions run more than an order of +magnitude faster than their portable C versions. The available SIMD +instruction sets, if any, are determined at run time and the proper +version of each routine is automatically selected. If no SIMD +instructions are available, the portable C version is invoked by +default. On targets other than IA-32 and PPC, only the portable C +version is built. + +The SIMD-assisted versions generally produce the same results as the C +versions, with a few minor exceptions. The Viterbi decoders in C have +a very slightly greater Eb/No performance due to their use of 32-bit +path metrics. On the other hand, the SIMD versions use the +"saturating" arithmetic available in these instructions to avoid the +integer wraparounds that can occur in C when argument ranges are not +properly constrained. This applies primarily to the "dotprod" (dot +product) function. + +The MMX (MultiMedia eXtensions) instruction set was introduced on +later Pentium CPUs; it is also implemented on the Pentium II and most +AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was +introduced in the Pentium III; AMD calls it "3D Now!
Professional". +Intel introduced SSE2 on the Pentium 4, and it has been picked up by +later AMD CPUs. SSE support implies MMX support, while SSE2 support +implies both SSE and MMX support. + +The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott +New Instructions") was introduced in early 2004 with the latest +("Prescott") revision of the Pentium 4. Relatively little was +introduced with SSE3, and this library currently makes no use of it. + +See the various manual pages for details on how to use the library +routines. + +Copyright 2006, Phil Karn, KA9Q +karn@ka9q.net +http://www.ka9q.net/ + +This software may be used under the terms of the GNU Lesser General +Public License (LGPL); see the file lesser.txt for details. + +Revision history: +Version 1.0 released 29 May 2001 + +Version 2.0 released 3 Dec 2001: +Restructured to add support for shared libraries. + +Version 2.0.1 released 8 Dec 2001: +Includes autoconf/configure script + +Version 2.0.2 released 4 Feb 2002: +Add SIMD version override options +Test for lack of SSE2 mnemonic support in 'as' +Build only selected version + +Version 2.0.3 released 6 Feb 2002: +Fix to parityb function in parity.h + +feclib version 1.0 released November 2003 +Merged SIMD-Viterbi, RS and DSP libraries +Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time + +feclib version 2.0 (unreleased) Mar 2004 +General speedups and cleanups +Switch from 4 to 8-bit input symbols on all Viterbi decoders +Support for Altivec on PowerPC +Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code +Changed license to GNU Lesser General Public License (LGPL) + +feclib version 2.1 June 5 2006 +Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults + +feclib version 2.1.1 June 6 2006 +Fix test/benchmark time measurement on Linux diff --git a/libfec/README.x86-64 b/libfec/README.x86-64 new file mode 100644 index 0000000..bb4450c --- 
/dev/null +++ b/libfec/README.x86-64 @@ -0,0 +1,13 @@ +This library has been modified to compile natively on x86-64. + +An attempt has been made to adapt the assembly code, but due to unsolved issues with +the fact that shared libraries on x86-64 have to be compiled with PIC, this approach is +not finished. + +This code therefore only uses the portable C implementation, which is certainly slower than +the assembly SSE2 that could ideally be used. + +It could be said that we trade performance against the possibility to compile on x86-64. + +feb, 2012 +Matthias P. Braendli, HB9EGM diff --git a/libfec/bootstrap b/libfec/bootstrap new file mode 100755 index 0000000..2f58d5c --- /dev/null +++ b/libfec/bootstrap @@ -0,0 +1,6 @@ +#!/bin/bash + +aclocal && \ +autoheader && \ +autoconf + diff --git a/libfec/ccsds.h b/libfec/ccsds.h new file mode 100644 index 0000000..ae65468 --- /dev/null +++ b/libfec/ccsds.h @@ -0,0 +1,5 @@ +typedef unsigned char data_t; +extern unsigned char Taltab[],Tal1tab[]; +#define NN 255 +#define NROOTS 32 + diff --git a/libfec/char.h b/libfec/char.h new file mode 100644 index 0000000..25efd65 --- /dev/null +++ b/libfec/char.h @@ -0,0 +1,24 @@ +/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + + + diff --git a/libfec/cmake/Modules/Version.cmake b/libfec/cmake/Modules/Version.cmake new file mode 100644 index 0000000..e8d5bd5 --- /dev/null +++ b/libfec/cmake/Modules/Version.cmake @@ -0,0 +1,115 @@ +# Portions of this file have been borrowed from and/or 
inspired by
+# the Version.cmake from the rtl-sdr project.
+# http://sdr.osmocom.org/trac/wiki/rtl-sdr
+#
+# Provides:
+#   ${VERSION_INFO_BASE}  - Major.Minor.Patch
+#   ${VERSION_INFO}       - Major.Minor.Patch[-git_info]
+#
+# Requires values for:
+#   ${VERSION_INFO_MAJOR} - Increment on API compatibility changes.
+#   ${VERSION_INFO_MINOR} - Increment when adding features.
+#   ${VERSION_INFO_PATCH} - Increment for bug and documentation changes.
+#
+# Optional:
+#   ${VERSION_INFO_EXTRA}    - Set to "git" to append git info. This is
+#                              intended only for non-versioned development
+#                              builds
+#   ${VERSION_INFO_OVERRIDE} - Set to a non-null value to override the
+#                              VERSION_INFO_EXTRA logic. This is intended
+#                              for automated snapshot builds from exported
+#                              trees, to pass in the git revision info.
+#
+if(DEFINED __INCLUDED_LIBFEC_VERSION_CMAKE)
+    return()
+endif()
+set(__INCLUDED_LIBFEC_VERSION_CMAKE TRUE)
+
+################################################################################
+# Gather up variables provided by parent script
+################################################################################
+
+if(NOT DEFINED VERSION_INFO_MAJOR)
+    message(FATAL_ERROR "VERSION_INFO_MAJOR is not defined")
+else()
+    set(VER_MAJ ${VERSION_INFO_MAJOR})
+endif()
+
+if(NOT DEFINED VERSION_INFO_MINOR)
+    message(FATAL_ERROR "VERSION_INFO_MINOR is not defined")
+else()
+    set(VER_MIN ${VERSION_INFO_MINOR})
+endif()
+
+if(NOT DEFINED VERSION_INFO_PATCH)
+    message(FATAL_ERROR "VERSION_INFO_PATCH is not defined")
+else()
+    set(VER_PAT ${VERSION_INFO_PATCH})
+endif()
+
+
+################################################################################
+# Craft version number, using git, if needed
+################################################################################
+find_package(Git QUIET)
+
+if(GIT_FOUND)
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --
+        ERROR_QUIET
+        RESULT_VARIABLE NOT_GIT_REPOSITORY
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    )
+
+    if(NOT_GIT_REPOSITORY)
+        set(GIT_INFO "-unknown")
+    else()
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD --
+            OUTPUT_VARIABLE GIT_REV OUTPUT_STRIP_TRAILING_WHITESPACE
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        )
+
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD --
+            RESULT_VARIABLE GIT_DIRTY
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        )
+
+        if(GIT_DIRTY)
+            set(GIT_INFO "-${GIT_REV}-dirty")
+        else()
+            set(GIT_INFO "-${GIT_REV}")
+        endif()
+    endif()
+
+else()
+    message(WARNING "git missing -- unable to check libfec version.")
+    # Same suffix as the "not a repository" case: keeps GIT_INFO defined
+    # here instead of leaving it unset or inherited from an outer scope.
+    set(GIT_INFO "-unknown")
+endif()
+
+
+################################################################################
+# Provide
+################################################################################
+set(VERSION_INFO_BASE "${VER_MAJ}.${VER_MIN}.${VER_PAT}")
+
+# Force the version suffix. Used for automated export builds.
+if(VERSION_INFO_OVERRIDE)
+    set(VERSION_INFO "${VERSION_INFO_BASE}-${VERSION_INFO_OVERRIDE}")
+
+# Intra-release builds
+elseif("${VERSION_INFO_EXTRA}" STREQUAL "git")
+    set(VERSION_INFO "${VERSION_INFO_BASE}-git${GIT_INFO}")
+
+# Versioned releases
+elseif("${VERSION_INFO_EXTRA}" STREQUAL "")
+    set(VERSION_INFO "${VERSION_INFO_BASE}")
+
+# Invalid
+else()
+    message(FATAL_ERROR
+        "Unexpected definition of VERSION_INFO_EXTRA: ${VERSION_INFO_EXTRA}")
+endif()
diff --git a/libfec/cmake/cmake_uninstall.cmake.in b/libfec/cmake/cmake_uninstall.cmake.in
new file mode 100644
index 0000000..2037e36
--- /dev/null
+++ b/libfec/cmake/cmake_uninstall.cmake.in
@@ -0,0 +1,27 @@
+# Uninstall helper: deletes every path recorded in install_manifest.txt.
+# The CMAKE_CURRENT_BINARY_DIR / CMAKE_COMMAND placeholders are template fields.
+if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+endif()
+
+file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+# One path per manifest line -> CMake list.
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  # IS_SYMLINK is tested as well so that dangling symlinks are still removed.
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    # execute_process() replaces the deprecated exec_program(); each argument
+    # is passed as-is, so the path needs no manual quote escaping.
+    execute_process(
+      COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}"
+      OUTPUT_VARIABLE rm_out
+      RESULT_VARIABLE rm_retval
+    )
+    if(NOT "${rm_retval}" STREQUAL "0")
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif()
+  else()
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif()
+endforeach()
diff --git a/libfec/config.guess b/libfec/config.guess
new file mode 100644
index 0000000..0f0fe71
--- /dev/null
+++ b/libfec/config.guess
@@ -0,0 +1,1516 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+# Inc.
+
+timestamp='2007-03-06'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner . +# Please send patches to . Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. 
+ break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. 
+# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? 
+ os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. 
+ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? 
+ echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm:riscos:*:*|arm:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. 
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo 
clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # 
Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[45]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo 
m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. 
GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + 
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo 
${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + case ${UNAME_MACHINE} in + pc98) + echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + amd64) + echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + *) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + esac + exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + *:Interix*:[3456]*) + case ${UNAME_MACHINE} in + x86) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + EM64T | authenticamd) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + esac ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? 
+ echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + arm*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit ;; + crisv32:Linux:*:*) + echo crisv32-axis-linux-gnu + exit ;; + frv:Linux:*:*) + echo frv-unknown-linux-gnu + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || 
defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + or32:Linux:*:*) + echo or32-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-gnu + exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit ;; + xtensa:Linux:*:*) + echo xtensa-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it 
will tell us. cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. + echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^LIBC/{ + s: ::g + p + }'`" + test x"${LIBC}" != x && { + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + exit + } + test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. 
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. + echo ${UNAME_MACHINE}-pc-os2-emx + exit ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. 
+ echo i386-pc-msdosdjgpp + exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + 
 + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@ccMail.Census.GOV> + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@openmarket.com>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
+ echo i586-pc-beos + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + SX-8R:SUPER-UX:*:*) + echo sx8r-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NSE-?:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit ;; + *:DragonFly:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case "${UNAME_MACHINE}" in + A*) echo alpha-dec-vms ; exit ;; + I*) echo ia64-dec-vms ; exit ;; + V*) echo vax-dec-vms ; exit ;; + esac ;; + *:XENIX:*:SysV) + echo i386-pc-xenix + exit ;; + i*86:skyos:*:*) + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + exit ;; + i*86:rdos:*:*) + echo ${UNAME_MACHINE}-pc-rdos + exit ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + +cat >&2 < in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/libfec/config.sub b/libfec/config.sub new file mode 100755 index 0000000..a06a480 --- /dev/null +++ b/libfec/config.sub @@ -0,0 +1,1362 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +# Free Software Foundation, Inc. + +timestamp='2001-04-20' + +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. +# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Please send patches to . +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS + $0 [OPTION] ALIAS + +Canonicalize a configuration name. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." 
+ +version="\ +GNU config.sub ($timestamp) + +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit 0 ;; + --version | -v ) + echo "$version" ; exit 0 ;; + --help | --h* | -h ) + echo "$usage"; exit 0 ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo $1 + exit 0;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | storm-chaos* | os2-emx*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. 
+ ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis) + os= + basic_machine=$1 + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. 
+ tahoe | i860 | ia64 | m32r | m68k | m68000 | m88k | ns32k | arc \ + | arm | arme[lb] | arm[bl]e | armv[2345] | armv[345][lb] | strongarm | xscale \ + | pyramid | mn10200 | mn10300 | tron | a29k \ + | 580 | i960 | h8300 \ + | x86 | ppcbe | mipsbe | mipsle | shbe | shle \ + | hppa | hppa1.0 | hppa1.1 | hppa2.0 | hppa2.0w | hppa2.0n \ + | hppa64 \ + | alpha | alphaev[4-8] | alphaev56 | alphapca5[67] \ + | alphaev6[78] \ + | we32k | ns16k | clipper | i370 | sh | sh[34] \ + | powerpc | powerpcle \ + | 1750a | dsp16xx | pdp10 | pdp11 \ + | mips16 | mips64 | mipsel | mips64el \ + | mips64orion | mips64orionel | mipstx39 | mipstx39el \ + | mips64vr4300 | mips64vr4300el | mips64vr4100 | mips64vr4100el \ + | mips64vr5000 | miprs64vr5000el | mcore | s390 | s390x \ + | sparc | sparclet | sparclite | sparc64 | sparcv9 | sparcv9b \ + | v850 | c4x \ + | thumb | d10v | d30v | fr30 | avr | openrisc | tic80 \ + | pj | pjl | h8500) + basic_machine=$basic_machine-unknown + ;; + m6811 | m68hc11 | m6812 | m68hc12) + # Motorola 68HC11/12. + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | z8k | v70 | w65) + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. + # FIXME: clean up the formatting here. 
+ vax-* | tahoe-* | i*86-* | i860-* | ia64-* | m32r-* | m68k-* | m68000-* \ + | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | c[123]* \ + | arm-* | armbe-* | armle-* | armv*-* | strongarm-* | xscale-* \ + | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \ + | power-* | none-* | 580-* | cray2-* | h8300-* | h8500-* | i960-* \ + | xmp-* | ymp-* \ + | x86-* | ppcbe-* | mipsbe-* | mipsle-* | shbe-* | shle-* \ + | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* | hppa2.0w-* \ + | hppa2.0n-* | hppa64-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphapca5[67]-* \ + | alphaev6[78]-* \ + | we32k-* | cydra-* | ns16k-* | pn-* | np1-* | xps100-* \ + | clipper-* | orion-* \ + | sparclite-* | pdp10-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \ + | sparc64-* | sparcv9-* | sparcv9b-* | sparc86x-* \ + | mips16-* | mips64-* | mipsel-* \ + | mips64el-* | mips64orion-* | mips64orionel-* \ + | mips64vr4100-* | mips64vr4100el-* | mips64vr4300-* | mips64vr4300el-* \ + | mipstx39-* | mipstx39el-* | mcore-* \ + | f30[01]-* | f700-* | s390-* | s390x-* | sv1-* | t3e-* \ + | [cjt]90-* \ + | m88110-* | m680[01234]0-* | m683?2-* | m68360-* | z8k-* | d10v-* \ + | thumb-* | v850-* | d30v-* | tic30-* | tic80-* | c30-* | fr30-* \ + | bs2000-* | tic54x-* | c54x-* | x86_64-* | pj-* | pjl-*) + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | ymp) + basic_machine=ymp-cray + os=-unicos + ;; + cray2) + basic_machine=cray2-cray + os=-unicos + ;; + [cjt]90) + basic_machine=${basic_machine}-cray + os=-unicos + ;; + crds | unos) + basic_machine=m68k-crds + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + 
basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) + basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? 
+ i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mipsel*-linux*) + basic_machine=mipsel-unknown + os=-linux-gnu + ;; + mips*-linux*) + basic_machine=mips-unknown + os=-linux-gnu + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + mmix*) + basic_machine=mmix-knuth + os=-mmixware + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + 
os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pentium | p5 | k5 | k6 | nexgen) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon) + basic_machine=i686-pc + ;; + pentiumii | pentium2) + basic_machine=i686-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + sa29200) + 
basic_machine=a29k-amd + os=-udi + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sparclite-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=t3e-cray + os=-unicos + ;; + tic54x | c54x*) + basic_machine=tic54x-unknown + os=-coff + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xmp) + basic_machine=xmp-cray + os=-unicos + ;; + xps | xps100) + 
basic_machine=xps100-honeywell + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. + w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + mips) + if [ x$os = x-linux-gnu ]; then + basic_machine=mips-unknown + else + basic_machine=mips-mips + fi + ;; + romp) + basic_machine=romp-ibm + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh3 | sh4) + basic_machine=sh-unknown + ;; + sparc | sparcv9 | sparcv9b) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + c4x*) + basic_machine=c4x-none + os=-coff + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. 
+ -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \ + | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \ + | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* | -os2*) + # Remember, each alternative MUST END IN *, to match a version number. 
+ ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto*) + os=-nto-qnx + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. 
Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + m68*-cisco) + os=-aout + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + *-tti) # must be before sparc entry or we get the wrong os. + os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-ibm) + os=-aix + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. 
+vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -aix*) + vendor=ibm + ;; + -beos*) + vendor=be + ;; + -hpux*) + vendor=hp + ;; + -mpeix*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs* | -opened*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -vxsim* | -vxworks*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + -hms*) + vendor=hitachi + ;; + -mpw* | -macos*) + vendor=apple + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + vendor=atari + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os +exit 0 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/libfec/configure.in b/libfec/configure.in new file mode 100644 index 0000000..10b5380 --- /dev/null +++ b/libfec/configure.in @@ -0,0 +1,90 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(viterbi27.c) +AC_CONFIG_HEADER(config.h) +SO_NAME=3 +VERSION=3.0.0 +AC_SUBST(SO_NAME) +AC_SUBST(VERSION) + +dnl Checks for programs. +AC_PROG_CC +if test $GCC != "yes" +then + AC_MSG_ERROR([Need GNU C compiler]) +fi +dnl Checks for libraries. +AC_CHECK_LIB(c, malloc) + +dnl Checks for header files. 
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h)
+dnl AC_CHECK_HEADERS stores each result in ac_cv_header_<name>_h (yes/no).
+dnl Test those cache variables: a "$HAVE_<name>.h" shell test expands to the
+dnl constant ".h" and can never fail. Each required header gets its own check;
+dnl getopt.h is probed but stays optional.
+if test "x$ac_cv_header_stdio_h" != xyes; then
+    AC_MSG_ERROR([Need stdio.h!])
+fi
+if test "x$ac_cv_header_stdlib_h" != xyes; then
+    AC_MSG_ERROR([Need stdlib.h!])
+fi
+if test "x$ac_cv_header_memory_h" != xyes; then
+    AC_MSG_ERROR([Need memory.h!])
+fi
+if test "x$ac_cv_header_string_h" != xyes; then
+    AC_MSG_ERROR([Need string.h])
+fi
+
+AC_CANONICAL_SYSTEM
+case $target_cpu in
+x86_64)
+    ARCH_OPTION="-msse2"
+    MLIBS="dotprod_port.o \
+    peakval_port.o \
+    sumsq.o sumsq_port.o \
+    cpu_mode_x86_64.o"
+    ;;
+i386|i486|i586|i686)
+    ARCH_OPTION="-march=$target_cpu"
+    MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \
+    viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \
+    viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \
+    viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \
+    dotprod_mmx.o dotprod_mmx_assist.o \
+    dotprod_sse2.o dotprod_sse2_assist.o \
+    peakval_mmx.o peakval_mmx_assist.o \
+    peakval_sse.o peakval_sse_assist.o \
+    peakval_sse2.o peakval_sse2_assist.o \
+    sumsq.o sumsq_port.o \
+    sumsq_sse2.o sumsq_sse2_assist.o \
+    sumsq_mmx.o sumsq_mmx_assist.o \
+    cpu_features.o cpu_mode_x86.o"
+    ;;
+powerpc*)
+    ARCH_OPTION="-fno-common -faltivec"
+    MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \
+    encode_rs_av.o \
+    dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o"
+    ;;
+*)
+    MLIBS="cpu_mode_generic.o"
+esac
+case $target_os in
+darwin*)
+    SH_LIB=libfec.dylib
+    REBIND=""
+    ;;
+*)
+    SH_LIB=libfec.so
+    REBIND=ldconfig
+    ;;
+esac
+AC_SUBST(SH_LIB)
+AC_SUBST(REBIND)
+AC_SUBST(MLIBS)
+AC_SUBST(ARCH_OPTION)
+
+
+dnl Checks for library functions.
+AC_CHECK_FUNCS(getopt_long memset memmove) + +AC_OUTPUT(makefile) diff --git a/libfec/cpu_features.s b/libfec/cpu_features.s new file mode 100644 index 0000000..ef4ba4e --- /dev/null +++ b/libfec/cpu_features.s @@ -0,0 +1,15 @@ +.text +.global cpu_features + .type cpu_features,@function +cpu_features: + pushl %ebx + pushl %ecx + pushl %edx + movl $1,%eax + cpuid + movl %edx,%eax + popl %edx + popl %ecx + popl %ebx + ret + \ No newline at end of file diff --git a/libfec/cpu_mode_generic.c b/libfec/cpu_mode_generic.c new file mode 100644 index 0000000..500f995 --- /dev/null +++ b/libfec/cpu_mode_generic.c @@ -0,0 +1,13 @@ +/* Determine CPU support for SIMD on Power PC + * Copyright 2004 Phil Karn, KA9Q + * Copyright 2014 Matthias P. Braendli, HB9EGM + */ +#include +#include "fec.h" + +enum cpu_mode Cpu_mode; + +// Use the portable code for this unknown CPU +void find_cpu_mode(void) { + Cpu_mode = PORT; +} diff --git a/libfec/cpu_mode_ppc.c b/libfec/cpu_mode_ppc.c new file mode 100644 index 0000000..0071558 --- /dev/null +++ b/libfec/cpu_mode_ppc.c @@ -0,0 +1,40 @@ +/* Determine CPU support for SIMD on Power PC + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" +#ifdef __VEC__ +#include +#endif + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine"}; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void){ + + if(Cpu_mode != UNKNOWN) + return; + +#ifdef __VEC__ + { + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if(0 == error && hasVectorUnit) + Cpu_mode = ALTIVEC; + else + Cpu_mode = PORT; + } +#else + Cpu_mode = PORT; +#endif + + fprintf(stderr,"SIMD CPU detect: 
%s\n",Cpu_modes[Cpu_mode]); +} diff --git a/libfec/cpu_mode_x86.c b/libfec/cpu_mode_x86.c new file mode 100644 index 0000000..322018e --- /dev/null +++ b/libfec/cpu_mode_x86.c @@ -0,0 +1,33 @@ +/* Determine CPU support for SIMD + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine"}; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void){ + + int f; + if(Cpu_mode != UNKNOWN) + return; + + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if(f & (1<<26)){ /* SSE2 is present */ + Cpu_mode = SSE2; + } else if(f & (1<<25)){ /* SSE is present */ + Cpu_mode = SSE; + } else if(f & (1<<23)){ /* MMX is present */ + Cpu_mode = MMX; + } else { /* No SIMD at all */ + Cpu_mode = PORT; + } + fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]); +} diff --git a/libfec/cpu_mode_x86_64.c b/libfec/cpu_mode_x86_64.c new file mode 100644 index 0000000..758096a --- /dev/null +++ b/libfec/cpu_mode_x86_64.c @@ -0,0 +1,27 @@ +/* Determine CPU support for SIMD + * Copyright 2004 Phil Karn, KA9Q + * + * Modified in 2012 by Matthias P. Braendli, HB9EGM + */ +#include +#include "fec.h" + +/* Various SIMD instruction set names */ +char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)", + "x86 Streaming SIMD Extensions (SSE)", + "x86 Streaming SIMD Extensions 2 (SSE2)", + "PowerPC G4/G5 Altivec/Velocity Engine"}; + +enum cpu_mode Cpu_mode; + +void find_cpu_mode(void){ + + int f; + if(Cpu_mode != UNKNOWN) + return; + + /* According to the wikipedia entry x86-64, all x86-64 processors have SSE2 */ + /* The same assumption is also in other source files ! 
*/ + Cpu_mode = SSE2; + fprintf(stderr,"CPU: x86-64, using portable C implementation\n"); +} diff --git a/libfec/decode_rs.c b/libfec/decode_rs.c new file mode 100644 index 0000000..d7f97b3 --- /dev/null +++ b/libfec/decode_rs.c @@ -0,0 +1,262 @@ +/* Reed-Solomon decoder + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include +#endif + +#include + +#define NULL ((void *)0) +#define min(a,b) ((a) < (b) ? (a) : (b)) + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +int DECODE_RS( +#ifdef FIXED +data_t *data, int *eras_pos, int no_eras,int pad){ +#else +void *p,data_t *data, int *eras_pos, int no_eras){ + struct rs *rs = (struct rs *)p; +#endif + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return -1; +#endif + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. 
*/ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). + */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. 
num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i) must be included by the calling + * program. + */ + + +#if !defined(NROOTS) +#error "NROOTS not defined" +#endif + +#if !defined(NN) +#error "NN not defined" +#endif + +#if !defined(PAD) +#error "PAD not defined" +#endif + +#if !defined(ALPHA_TO) +#error "ALPHA_TO not defined" +#endif + +#if !defined(INDEX_OF) +#error "INDEX_OF not defined" +#endif + +#if !defined(MODNN) +#error "MODNN not defined" +#endif + +#if !defined(FCR) +#error "FCR not defined" +#endif + +#if !defined(PRIM) +#error "PRIM not defined" +#endif + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#undef MIN +#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) +#undef A0 +#define A0 (NN) + +{ + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. */ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible 
roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). + */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i +#endif + +#include + +#include "fixed.h" + +int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){ + int retval; + + if(pad < 0 || pad > 222){ + return -1; + } + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/decode_rs_ccsds.c b/libfec/decode_rs_ccsds.c new file mode 100644 index 0000000..0e246b4 --- /dev/null +++ b/libfec/decode_rs_ccsds.c @@ -0,0 +1,26 @@ +/* This function wraps around the fixed 8-bit decoder, performing the + * basis 
transformations necessary to meet the CCSDS standard + * + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include "ccsds.h" +#include "fec.h" + +int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){ + int i,r; + data_t cdata[NN]; + + /* Convert data from dual basis to conventional */ + for(i=0;i 0){ + /* Convert from conventional to dual basis */ + for(i=0;i +#endif + +#include + +#include "char.h" +#include "rs-common.h" + +int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/decode_rs_int.c b/libfec/decode_rs_int.c new file mode 100644 index 0000000..1ef1a1f --- /dev/null +++ b/libfec/decode_rs_int.c @@ -0,0 +1,22 @@ +/* General purpose Reed-Solomon decoder + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include +#endif + +#include + +#include "int.h" +#include "rs-common.h" + +int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/dotprod.c b/libfec/dotprod.c new file mode 100644 index 0000000..5fb1da9 --- /dev/null +++ b/libfec/dotprod.c @@ -0,0 +1,111 @@ +/* 16-bit signed integer dot product + * Switch to appropriate versions + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +void *initdp_port(signed short coeffs[],int len); +long dotprod_port(void *p,signed short *b); +void freedp_port(void *p); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void *initdp_sse2(signed short coeffs[],int len); +long dotprod_mmx(void *p,signed short *b); +long dotprod_sse2(void *p,signed short *b); +void freedp_mmx(void *p); 
+void freedp_sse2(void *p); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +long dotprod_av(void *p,signed short *b); +void freedp_av(void *p); +#endif + +/* Create and return a descriptor for use with the dot product function */ +void *initdp(signed short coeffs[],int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return initdp_port(coeffs,len); +#ifdef __i386__ + case MMX: + case SSE: + return initdp_mmx(coeffs,len); + case SSE2: + return initdp_sse2(coeffs,len); +#endif + +#ifdef __x86_64__ + case SSE2: + return initdp_port(coeffs,len); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return initdp_av(coeffs,len); +#endif + } +} + + +/* Free a dot product descriptor created earlier */ +void freedp(void *p){ + switch(Cpu_mode){ + case PORT: + default: + return freedp_port(p); +#ifdef __i386__ + case MMX: + case SSE: + return freedp_mmx(p); + case SSE2: + return freedp_sse2(p); +#endif + +#ifdef __x86_64__ + case SSE2: + return freedp_port(p); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return freedp_av(p); +#endif + } +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod(void *p,signed short a[]){ + switch(Cpu_mode){ + case PORT: + default: + return dotprod_port(p,a); +#ifdef __i386__ + case MMX: + case SSE: + return dotprod_mmx(p,a); + case SSE2: + return dotprod_sse2(p,a); +#endif + +#ifdef __x86_64__ + case SSE2: + return dotprod_port(p,a); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return dotprod_av(p,a); +#endif + } +} + + diff --git a/libfec/dotprod.h b/libfec/dotprod.h new file mode 100644 index 0000000..6b62b70 --- /dev/null +++ b/libfec/dotprod.h @@ -0,0 +1,15 @@ +/* Internal definitions for dotproduct function */ + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to meet all possible input data + * alignments (see 
Intel ap559 on MMX dot products). + * + * SSE2 is similar, but with 8 words at a time + * + * On a non-MMX machine, only one copy is present + */ + signed short *coeffs[8]; +}; diff --git a/libfec/dotprod_av.c b/libfec/dotprod_av.c new file mode 100644 index 0000000..1f70471 --- /dev/null +++ b/libfec/dotprod_av.c @@ -0,0 +1,93 @@ +/* 16-bit signed integer dot product + * Altivec-assisted version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On an Altivec machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + */ + signed short *coeffs[8]; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_av(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_av(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_av(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + vector signed short *ar,*d; + vector signed int sums0,sums1,sums2,sums3; + union { vector signed int v; signed int w[4];} s; + int nblocks; + + /* round ar down to beginning of 16-byte block containing 0th element of + * input buffer. 
Then set d to one of 8 sets of shifted coefficients + */ + ar = (vector signed short *)((int)a & ~15); + al = ((int)a & 15)/sizeof(signed short); + d = (vector signed short *)dp->coeffs[al]; + + nblocks = (dp->len+al-1)/8+1; + + /* Sum into four vectors each holding four 32-bit partial sums */ + sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); + while(nblocks >= 4){ + sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); + sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); + sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); + sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); + nblocks -= 4; + } + sums0 = vec_adds(sums0,sums1); + sums2 = vec_adds(sums2,sums3); + sums0 = vec_adds(sums0,sums2); + while(nblocks-- > 0){ + sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); + } + /* Sum 4 partial sums into final result */ + s.v = vec_sums(sums0,(vector signed int)(0)); + + return s.w[3]; +} + + diff --git a/libfec/dotprod_mmx.c b/libfec/dotprod_mmx.c new file mode 100644 index 0000000..c516afe --- /dev/null +++ b/libfec/dotprod_mmx.c @@ -0,0 +1,81 @@ +/* 16-bit signed integer dot product + * MMX assisted version; also for SSE + * + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). 
+ */ + signed short *coeffs[4]; +}; +long dotprod_mmx_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_mmx(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 4 copies of coefficients, one for each data alignment */ + for(i=0;i<4;i++){ + dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4, + 4*sizeof(signed short)); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_mmx(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<4;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_mmx(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + /* Round input data address down to 8 byte boundary + * NB: depending on the alignment of a[], memory + * before a[] will be accessed. The contents don't matter since they'll + * be multiplied by zero coefficients. I can't conceive of any + * situation where this could cause a segfault since memory protection + * in the x86 machines is done on much larger boundaries + */ + ar = (signed short *)((int)a & ~7); + + /* Choose one of 4 sets of pre-shifted coefficients. 
al is both the + * index into dp->coeffs[] and the number of 0 words padded onto + * that coefficients array for alignment purposes + */ + al = a - ar; + + /* Call assembler routine to do the work, passing number of 4-word blocks */ + return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1); +} + diff --git a/libfec/dotprod_mmx_assist.s b/libfec/dotprod_mmx_assist.s new file mode 100644 index 0000000..25deffd --- /dev/null +++ b/libfec/dotprod_mmx_assist.s @@ -0,0 +1,83 @@ +# SIMD MMX dot product +# Equivalent to the following C code: +# long dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 4; +# while(cnt--) +# sum += *a++ + *b++; +# return sum; +# } +# a and b should also be 64-bit aligned, or speed will suffer greatly +# Copyright 1999, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_mmx_assist + .type dotprod_mmx_assist,@function +dotprod_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %mm0,%mm0 # clear running sum (in two 32-bit halves) + +# MMX dot product loop unrolled 4 times, crunching 16 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movq (%esi),%mm1 # mm1 = a[3],a[2],a[1],a[0] + pmaddwd (%edi),%mm1 # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0] + paddd %mm1,%mm0 + + movq 8(%esi),%mm1 + pmaddwd 8(%edi),%mm1 + paddd %mm1,%mm0 + + movq 16(%esi),%mm1 + pmaddwd 16(%edi),%mm1 + paddd %mm1,%mm0 + + movq 24(%esi),%mm1 + addl $32,%esi + pmaddwd 24(%edi),%mm1 + addl $32,%edi + paddd %mm1,%mm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx + +# MMX dot product loop, not unrolled, crunching 4 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movq (%esi),%mm1 + addl $8,%esi + pmaddwd (%edi),%mm1 + addl $8,%edi + paddd %mm1,%mm0 + jmp 
.Loop2 +.Loop2Done: + + movd %mm0,%ebx # right-hand word to ebx + punpckhdq %mm0,%mm0 # left-hand word to right side of %mm0 + movd %mm0,%eax + addl %ebx,%eax # running sum now in %eax + emms # done with MMX + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret diff --git a/libfec/dotprod_port.c b/libfec/dotprod_port.c new file mode 100644 index 0000000..ef635ec --- /dev/null +++ b/libfec/dotprod_port.c @@ -0,0 +1,58 @@ +/* 16-bit signed integer dot product + * Portable C version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + signed short *coeffs; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_port(signed short coeffs[],int len){ + struct dotprod *dp; + int j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Just one copy of the coefficients for the C version */ + dp->coeffs = (signed short *)calloc(len,sizeof(signed short)); + for(j=0;jcoeffs[j] = coeffs[j]; + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_port(void *p){ + struct dotprod *dp = (struct dotprod *)p; + + if(dp->coeffs != NULL) + free(dp->coeffs); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_port(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + long corr; + int i; + + corr = 0; + for(i=0;ilen;i++){ + corr += (long)a[i] * dp->coeffs[i]; + } + return corr; +} + + diff --git a/libfec/dotprod_sse2.c b/libfec/dotprod_sse2.c new file mode 100644 index 0000000..1fddd18 --- /dev/null +++ b/libfec/dotprod_sse2.c @@ -0,0 +1,72 @@ +/* 16-bit signed integer dot product + * SSE2 version + * Copyright 2004 Phil Karn + * May be used under the terms 
of the GNU Lesser General Public License (LGPL) + */ +#define _XOPEN_SOURCE 600 +#include +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a SSE2 machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). + */ + signed short *coeffs[8]; +}; + +long dotprod_sse2_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_sse2(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j,blksize; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + blksize = (1+(len+i-1)/8) * 8*sizeof(signed short); + posix_memalign((void **)&dp->coeffs[i],16,blksize); + memset(dp->coeffs[i],0,blksize); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_sse2(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_sse2(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + ar = (signed short *)((int)a & ~15); + al = a - ar; + + /* Call assembler routine to do the work, passing number of 8-word blocks */ + return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1); +} diff --git a/libfec/dotprod_sse2_assist.s b/libfec/dotprod_sse2_assist.s new file mode 100644 index 0000000..47348fa --- /dev/null +++ b/libfec/dotprod_sse2_assist.s @@ -0,0 +1,85 @@ +# SIMD SSE2 dot product +# Equivalent to the following C 
code: +# long dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 8; +# while(cnt--) +# sum += *a++ * *b++; +# return sum; +# } +# a and b must be 128-bit aligned +# Copyright 2001, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_sse2_assist + .type dotprod_sse2_assist,@function +dotprod_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %xmm0,%xmm0 # clear running sum (in four 32-bit lanes) + +# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movdqa (%esi),%xmm1 + pmaddwd (%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 16(%esi),%xmm1 + pmaddwd 16(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 32(%esi),%xmm1 + pmaddwd 32(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 48(%esi),%xmm1 + addl $64,%esi + pmaddwd 48(%edi),%xmm1 + addl $64,%edi + paddd %xmm1,%xmm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx # undo the last subtraction: 8-word blocks still to do + +# SSE2 dot product loop, not unrolled, crunching 8 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movdqa (%esi),%xmm1 + addl $16,%esi + pmaddwd (%edi),%xmm1 + addl $16,%edi + paddd %xmm1,%xmm0 + jmp .Loop2 +.Loop2Done: + + movdqa %xmm0,%xmm1 # copy the four partial sums + psrldq $8,%xmm0 # move the upper two partial sums down + paddd %xmm1,%xmm0 # low two lanes now hold the pairwise sums + movd %xmm0,%eax # right-hand word to eax + psrldq $4,%xmm0 + movd %xmm0,%ebx + addl %ebx,%eax # total now in %eax + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret diff --git a/libfec/dsp.3 b/libfec/dsp.3 new file mode 100644 index 0000000..e9794da --- /dev/null +++ b/libfec/dsp.3 @@ -0,0 +1,63 @@ +.TH DSP 3 +.SH NAME +initdp, freedp, dotprod, sumsq, peakval -\ SIMD-assisted +digital signal processing primitives +.SH SYNOPSIS +.nf +.ft +#include "fec.h" + +void *initdp(signed short *coeffs,int len); +long 
dotprod(void *p,signed short *a); +void freedp(void *p); + +unsigned long long sumsq(signed short *in,int cnt); + +int peakval(signed short *b,int cnt); + +.SH DESCRIPTION +These functions provide several basic primitives useful in digital +signal processing (DSP), especially in modems. The \fBinitdp\fR, +\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot +product useful in correlation and filtering operations on signed +16-bit integers. \fBsumsq\fR computes the sum +of the squares of an array of signed 16-bit integers, +useful for measuring the energy of a signal. \fBpeakval\fR returns the +absolute value of the largest magnitude element in the input array, +useful for scaling a signal's amplitude. + +Each function uses IA32 or PowerPC Altivec instructions when +available; otherwise, a portable C version is used. + +.SH USAGE +To create a FIR filter or correlator, call \fBinitdp\fR with the +coefficients in \fBcoeffs\fR and their number in \fBlen\fR. This +creates the appropriate data structures and returns a handle. + +To compute a dot product, pass the handle from \fBinitdp\fR and the +input array to \fBdotprod\fR. No length field is needed as the number +of samples will be taken from the \fBlen\fR parameter originally given +to \fBinitdp\fR. There must be at least as many samples in the input +array as there were coefficients passed to \fBinitdp\fR. + +When the filter or correlator is no longer needed, the data structures +may be freed by passing the handle to \fBfreedp\fR. + +The user is responsible for scaling the inputs to \fBinitdp\fR and +\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently +wrap around in the event of overflow. + +To compute the sum of the squares of an array of signed 16-bit +integers, use \fBsumsq\fR. This returns a 64-bit sum. + +\fBpeakval\fR computes the absolute value of each 16-bit element in +the input array and returns the largest. 
+ +.SH RETURN VALUES + +\fBinitdp\fR returns a handle that points to a control block, or NULL in +the event of an error (such as a memory allocation failure). \fBsumsq\fR +and \fBpeakval\fR have no error returns. + +.SH AUTHOR and COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) diff --git a/libfec/dtest.c b/libfec/dtest.c new file mode 100644 index 0000000..394cb03 --- /dev/null +++ b/libfec/dtest.c @@ -0,0 +1,99 @@ +/* Test dot-product function */ + +#include +#include +#include +#include +#include "config.h" +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {"trials",0,NULL,'n'}, + {NULL}, +}; +#endif + +int main(int argc,char *argv[]){ + short coeffs[512]; + short input[2048]; + int trials=1000,d; + int errors = 0; + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"apmstn:")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'n': + trials = atoi(optarg); + break; + } + } + + while(trials--){ + long port_result; + long simd_result; + int ntaps; + int i; + int csum = 0; + int offset; + void *dp_simd,*dp_port; + + /* Generate set of coefficients + * limit sum of absolute values to 32767 to avoid overflow + */ + memset(coeffs,0,sizeof(coeffs)); + for(i=0;i<512;i++){ + double gv; + + gv = normal_rand(0.,100.); + if(csum + fabs(gv) > 32767) + break; + coeffs[i] = gv; + csum += fabs(gv); + } + ntaps = i; + + /* Compare results to portable C version for a bunch of random data buffers and offsets */ + dp_simd = initdp(coeffs,ntaps); + dp_port = initdp_port(coeffs,ntaps); + + for(i=0;i<2048;i++) + input[i] = 
random(); + + offset = random() & 511; + + simd_result = dotprod(dp_simd,input+offset); + port_result = dotprod_port(dp_port,input+offset); + if(simd_result != port_result){ + errors++; + } + } + printf("dtest: %d errors\n",errors); + exit(0); +} diff --git a/libfec/encode_rs.c b/libfec/encode_rs.c new file mode 100644 index 0000000..0649094 --- /dev/null +++ b/libfec/encode_rs.c @@ -0,0 +1,52 @@ +/* Reed-Solomon encoder + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +void ENCODE_RS( +#ifdef FIXED +data_t *data, data_t *bb,int pad){ +#else +void *p,data_t *data, data_t *bb){ + struct rs *rs = (struct rs *)p; +#endif + int i, j; + data_t feedback; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return; +#endif + + memset(bb,0,NROOTS*sizeof(data_t)); + + for(i=0;i) must be included by the calling + * program. 
+ + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + + +#undef A0 +#define A0 (NN) /* Special reserved value encoding zero in index form */ + +{ + int i, j; + data_t feedback; + + memset(parity,0,NROOTS*sizeof(data_t)); + + for(i=0;i +#include "fixed.h" +#ifdef __VEC__ +#include +#endif + + +static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode; + +static void encode_rs_8_c(data_t *data, data_t *parity,int pad); +#if __vec__ +static void encode_rs_8_av(data_t *data, data_t *parity,int pad); +#endif +#if __i386__ +int cpu_features(void); +#endif + +void encode_rs_8(data_t *data, data_t *parity,int pad){ + if(cpu_mode == UNKNOWN){ +#ifdef __i386__ + int f; + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if(f & (1<<26)){ /* SSE2 is present */ + cpu_mode = SSE2; + } else if(f & (1<<25)){ /* SSE is present */ + cpu_mode = SSE; + } else if(f & (1<<23)){ /* MMX is present */ + cpu_mode = MMX; + } else { /* No SIMD at all */ + cpu_mode = PORT; + } +#elif __x86_64__ + cpu_mode = SSE2; +#elif __VEC__ + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if(0 == error && hasVectorUnit) + cpu_mode = ALTIVEC; + else + cpu_mode = PORT; +#else + cpu_mode = PORT; +#endif + } + switch(cpu_mode){ +#if __vec__ + case ALTIVEC: + encode_rs_8_av(data,parity,pad); + return; +#endif + +#if __i386__ + case MMX: + case SSE: + case SSE2: +#endif + +#ifdef __x86_64__ + case SSE2: +#endif + + default: + encode_rs_8_c(data,parity,pad); + return; + } +} + +#if __vec__ /* PowerPC G4/G5 Altivec instructions are available */ + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned 
char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. Since the generator polynomial is + * palindromic, we form the other half by reversing this one + */ +extern static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static void encode_rs_8_av(data_t *data, data_t *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i +#include +#include "fixed.h" + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. Since the generator polynomial is + * palindromic, we form it by reversing these on the fly + */ +static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +void rs_init_av(){ + int i,j; + + /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */ + for(j=0;j<16;j++){ + table[0].c[j] = 0; + for(i=1;i<256;i++){ + table[i].c[16-j-1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j+1] + CCSDS_index_of[i])]; + } + } +#if 0 + for(i=0;i<256;i++){ + printf("table[%3d] = %3vu\n",i,table[i].v); + } +#endif +} + +void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i + +#include "char.h" +#include "rs-common.h" + +void encode_rs_char(void *p,data_t *data, data_t 
*parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/libfec/encode_rs_int.c b/libfec/encode_rs_int.c new file mode 100644 index 0000000..3c9ce78 --- /dev/null +++ b/libfec/encode_rs_int.c @@ -0,0 +1,15 @@ +/* Reed-Solomon encoder + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "int.h" +#include "rs-common.h" + +void encode_rs_int(void *p,data_t *data, data_t *parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/libfec/exercise.c b/libfec/exercise.c new file mode 100644 index 0000000..8ae008c --- /dev/null +++ b/libfec/exercise.c @@ -0,0 +1,122 @@ +/* Exercise an RS codec a specified number of times using random + * data and error patterns + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */ + +#include +#include +#include + +#ifdef FIXED +#include "fixed.h" +#define EXERCISE exercise_8 +#elif defined(CCSDS) +#include "fixed.h" +#include "ccsds.h" +#define EXERCISE exercise_ccsds +#elif defined(BIGSYM) +#include "int.h" +#define EXERCISE exercise_int +#else +#include "char.h" +#define EXERCISE exercise_char +#endif + +#ifdef FIXED +#define PRINTPARM printf("(255,223):"); +#elif defined(CCSDS) +#define PRINTPARM printf("CCSDS (255,223):"); +#else +#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots); +#endif + +/* Exercise the RS codec passed as an argument */ +int EXERCISE( +#if !defined(CCSDS) && !defined(FIXED) +void *p, +#endif +int trials){ +#if !defined(CCSDS) && !defined(FIXED) + struct rs *rs = (struct rs *)p; +#endif + data_t block[NN],tblock[NN]; + int i; + int errors; + int errlocs[NN]; + int derrlocs[NROOTS]; + int derrors; + int errval,errloc; + int erasures; + int decoder_errors = 0; + + while(trials-- != 0){ + /* Test up to the error correction 
capacity of the code */ + for(errors=0;errors <= NROOTS/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i +#include "fec.h" + +unsigned char Partab[256]; +int P_init; + +/* Create 256-entry odd-parity lookup table + * Needed only on non-ia32 machines + */ +void partab_init(void){ + int i,cnt,ti; + + /* Initialize parity lookup table */ + for(i=0;i<256;i++){ + cnt = 0; + ti = i; + while(ti){ + if(ti & 1) + cnt++; + ti >>= 1; + } + Partab[i] = cnt & 1; + } + P_init=1; +} + +/* Lookup table giving count of 1 bits for integers 0-255 */ +int Bitcnt[] = { + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, +}; + diff --git a/libfec/fec.h b/libfec/fec.h new file mode 100644 index 0000000..d6d4b08 --- /dev/null +++ b/libfec/fec.h @@ -0,0 +1,355 @@ +/* User include file for libfec + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifndef _FEC_H_ +#define _FEC_H_ + +/* r=1/2 k=7 convolutional encoder polynomials + * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB + * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted + */ +#define V27POLYA 0x6d +#define V27POLYB 0x4f + 
+void *create_viterbi27(int len); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); + +#ifdef __VEC__ +void *create_viterbi27_av(int len); +void set_viterbi27_polynomial_av(int polys[2]); +int init_viterbi27_av(void *p,int starting_state); +int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_av(void *p); +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi27_mmx(int len); +void set_viterbi27_polynomial_mmx(int polys[2]); +int init_viterbi27_mmx(void *p,int starting_state); +int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_mmx(void *p); +int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse(int len); +void set_viterbi27_polynomial_sse(int polys[2]); +int init_viterbi27_sse(void *p,int starting_state); +int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse(void *p); +int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse2(int len); +void set_viterbi27_polynomial_sse2(int polys[2]); +int init_viterbi27_sse2(void *p,int starting_state); +int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse2(void *p); +int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi27_port(int len); +void set_viterbi27_polynomial_port(int polys[2]); +int init_viterbi27_port(void *p,int starting_state); +int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int 
nbits,unsigned int endstate); +void delete_viterbi27_port(void *p); +int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/2 k=9 convolutional encoder polynomials */ +#define V29POLYA 0x1af +#define V29POLYB 0x11d + +void *create_viterbi29(int len); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); + +#ifdef __VEC__ +void *create_viterbi29_av(int len); +void set_viterbi29_polynomial_av(int polys[2]); +int init_viterbi29_av(void *p,int starting_state); +int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_av(void *p); +int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi29_mmx(int len); +void set_viterbi29_polynomial_mmx(int polys[2]); +int init_viterbi29_mmx(void *p,int starting_state); +int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_mmx(void *p); +int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse(int len); +void set_viterbi29_polynomial_sse(int polys[2]); +int init_viterbi29_sse(void *p,int starting_state); +int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_sse(void *p); +int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse2(int len); +void set_viterbi29_polynomial_sse2(int polys[2]); +int init_viterbi29_sse2(void *p,int starting_state); +int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_sse2(void *p); +int update_viterbi29_blk_sse2(void *p,unsigned char 
*syms,int nbits); +#endif + +void *create_viterbi29_port(int len); +void set_viterbi29_polynomial_port(int polys[2]); +int init_viterbi29_port(void *p,int starting_state); +int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_port(void *p); +int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/3 k=9 convolutional encoder polynomials */ +#define V39POLYA 0x1ed +#define V39POLYB 0x19b +#define V39POLYC 0x127 + +void *create_viterbi39(int len); +void set_viterbi39_polynomial(int polys[3]); +int init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); + +#ifdef __VEC__ +void *create_viterbi39_av(int len); +void set_viterbi39_polynomial_av(int polys[3]); +int init_viterbi39_av(void *p,int starting_state); +int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_av(void *p); +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi39_mmx(int len); +void set_viterbi39_polynomial_mmx(int polys[3]); +int init_viterbi39_mmx(void *p,int starting_state); +int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_mmx(void *p); +int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse(int len); +void set_viterbi39_polynomial_sse(int polys[3]); +int init_viterbi39_sse(void *p,int starting_state); +int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse(void *p); +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse2(int len); +void 
set_viterbi39_polynomial_sse2(int polys[3]); +int init_viterbi39_sse2(void *p,int starting_state); +int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse2(void *p); +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi39_port(int len); +void set_viterbi39_polynomial_port(int polys[3]); +int init_viterbi39_port(void *p,int starting_state); +int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_port(void *p); +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits); + + +/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion + * dfree = 56 + * These bits may be left-right flipped from some textbook representations; + * here I have the bits entering the shift register from the right (low) end + * + * Some other spacecraft use the same code, but with the polynomials in a different order. + * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion, + * so use set_viterbi615_polynomial() as appropriate. 
+ */ +#define V615POLYA 042631 +#define V615POLYB 047245 +#define V615POLYC 056507 +#define V615POLYD 073363 +#define V615POLYE 077267 +#define V615POLYF 064537 + +void *create_viterbi615(int len); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits); +int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); + +#ifdef __VEC__ +void *create_viterbi615_av(int len); +void set_viterbi615_polynomial_av(int polys[6]); +int init_viterbi615_av(void *p,int starting_state); +int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_av(void *p); +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi615_mmx(int len); +void set_viterbi615_polynomial_mmx(int polys[6]); +int init_viterbi615_mmx(void *p,int starting_state); +int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_mmx(void *p); +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse(int len); +void set_viterbi615_polynomial_sse(int polys[6]); +int init_viterbi615_sse(void *p,int starting_state); +int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse(void *p); +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse2(int len); +void set_viterbi615_polynomial_sse2(int polys[6]); +int init_viterbi615_sse2(void *p,int starting_state); +int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse2(void *p); +int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void 
*create_viterbi615_port(int len); +void set_viterbi615_polynomial_port(int polys[6]); +int init_viterbi615_port(void *p,int starting_state); +int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_port(void *p); +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits); + + +/* General purpose RS codec, 8-bit symbols */ +void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity); +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); +void *init_rs_char(int symsize,int gfpoly, + int fcr,int prim,int nroots, + int pad); +void free_rs_char(void *rs); + +/* General purpose RS codec, integer symbols */ +void encode_rs_int(void *rs,int *data,int *parity); +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); +void *init_rs_int(int symsize,int gfpoly,int fcr, + int prim,int nroots,int pad); +void free_rs_int(void *rs); + +/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis) + * symbol representation + */ +void encode_rs_8(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */ +void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* Tables to map from conventional->dual (Taltab) and + * dual->conventional (Tal1tab) bases + */ +extern unsigned char Taltab[],Tal1tab[]; + + +/* CPU SIMD instruction set available */ +extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode; +void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */ + +/* Determine parity of argument: 1 = odd, 0 = even */ +#if defined(__i386__) || defined(__x86_64__) +static inline int parityb(unsigned char x){ + __asm__ __volatile__ ("test %1,%1;setpo %0" : "=q"(x) : "q" (x)); + return 
x; +} +#else +void partab_init(void); /* fills Partab[] and sets P_init */ + +static inline int parityb(unsigned char x){ + extern unsigned char Partab[256]; + extern int P_init; + if(!P_init){ + partab_init(); + } + return Partab[x]; +} +#endif + + +static inline int parity(int x){ + /* Fold down to one byte */ + x ^= (x >> 16); + x ^= (x >> 8); + return parityb(x); +} + +/* Useful utilities for simulation */ +double normal_rand(double mean, double std_dev); +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip); + +extern int Bitcnt[]; + +/* Dot product functions */ +void *initdp(signed short coeffs[],int len); +void freedp(void *dp); +long dotprod(void *dp,signed short a[]); + +void *initdp_port(signed short coeffs[],int len); +void freedp_port(void *dp); +long dotprod_port(void *dp,signed short a[]); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void freedp_mmx(void *dp); +long dotprod_mmx(void *dp,signed short a[]); + +void *initdp_sse(signed short coeffs[],int len); +void freedp_sse(void *dp); +long dotprod_sse(void *dp,signed short a[]); + +void *initdp_sse2(signed short coeffs[],int len); +void freedp_sse2(void *dp); +long dotprod_sse2(void *dp,signed short a[]); +#endif + +#ifdef __x86_64__ +void *initdp_sse2(signed short coeffs[],int len); +void freedp_sse2(void *dp); +long dotprod_sse2(void *dp,signed short a[]); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +void freedp_av(void *dp); +long dotprod_av(void *dp,signed short a[]); +#endif + +/* Sum of squares - accepts signed shorts, produces unsigned long long */ +unsigned long long sumsq(signed short *in,int cnt); +unsigned long long sumsq_port(signed short *in,int cnt); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *in,int cnt); +unsigned long long sumsq_sse(signed short *in,int cnt); +unsigned long long sumsq_sse2(signed short *in,int cnt); +#endif +#ifdef __x86_64__ +unsigned long long sumsq_sse2(signed short *in,int cnt); +#endif +#ifdef __VEC__ 
+unsigned long long sumsq_av(signed short *in,int cnt); +#endif + + +/* Low-level data structures and routines */ + +int cpu_features(void); + +#endif /* _FEC_H_ */ + + + diff --git a/libfec/fixed.h b/libfec/fixed.h new file mode 100644 index 0000000..0ff27b2 --- /dev/null +++ b/libfec/fixed.h @@ -0,0 +1,33 @@ +/* Stuff specific to the CCSDS (255,223) RS codec + * (255,223) code over GF(256). Note: the conventional basis is still + * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c + * + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +static inline int mod255(int x){ + while (x >= 255) { + x -= 255; + x = (x >> 8) + (x & 255); + } + return x; +} +#define MODNN(x) mod255(x) + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +#define MM 8 +#define NN 255 +#define ALPHA_TO CCSDS_alpha_to +#define INDEX_OF CCSDS_index_of +#define GENPOLY CCSDS_poly +#define NROOTS 32 +#define FCR 112 +#define PRIM 11 +#define IPRIM 116 +#define PAD pad + diff --git a/libfec/gen_ccsds.c b/libfec/gen_ccsds.c new file mode 100644 index 0000000..e1e2e26 --- /dev/null +++ b/libfec/gen_ccsds.c @@ -0,0 +1,39 @@ +/* Generate tables for CCSDS code + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include "char.h" +#include "rs-common.h" +#include "fec.h" + +int main(){ + struct rs *rs; + int i; + + rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */ + assert(rs != NULL); + printf("char CCSDS_alpha_to[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("0x%02x,",rs->alpha_to[i]); + } + printf("\n};\n\nchar CCSDS_index_of[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("%3d,",rs->index_of[i]); + } + printf("\n};\n\nchar CCSDS_poly[] = {"); + for(i=0;i<33;i++){ + 
if((i % 16) == 0) + printf("\n"); + + printf("%3d,",rs->genpoly[i]); + } + printf("\n};\n"); + exit(0); +} diff --git a/libfec/gen_ccsds_tal.c b/libfec/gen_ccsds_tal.c new file mode 100644 index 0000000..fc75503 --- /dev/null +++ b/libfec/gen_ccsds_tal.c @@ -0,0 +1,53 @@ +/* Conversion lookup tables from conventional alpha to Berlekamp's + * dual-basis representation. Used in the CCSDS version only. + * taltab[] -- convert conventional to dual basis + * tal1tab[] -- convert dual basis to conventional + + * Note: the actual RS encoder/decoder works with the conventional basis. + * So data is converted from dual to conventional basis before either + * encoding or decoding and then converted back. + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include + +#define DTYPE unsigned char +DTYPE Taltab[256],Tal1tab[256]; + +static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b }; + +/* Generate conversion lookup tables between conventional alpha representation + * (@**7, @**6, ...@**0) + * and Berlekamp's dual basis representation + * (l0, l1, ...l7) + */ +int main(){ + int i,j,k; + + for(i=0;i<256;i++){/* For each value of input */ + Taltab[i] = 0; + for(j=0;j<8;j++) /* for each column of matrix */ + for(k=0;k<8;k++){ /* for each row of matrix */ + if(i & (1< +#include "fec.h" + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#include "rs-common.h" + +void free_rs(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void 
*init_rs_common(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/init_rs.h b/libfec/init_rs.h new file mode 100644 index 0000000..2b2ae98 --- /dev/null +++ b/libfec/init_rs.h @@ -0,0 +1,106 @@ +/* Common code for intializing a Reed-Solomon control block (char or int symbols) + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#undef NULL +#define NULL ((void *)0) + +{ + int i, j, sr,root,iprim; + + rs = NULL; + /* Check parameter ranges */ + if(symsize < 0 || symsize > 8*sizeof(data_t)){ + goto done; + } + + if(fcr < 0 || fcr >= (1<= (1<= (1<= ((1<mm = symsize; + rs->nn = (1<pad = pad; + + rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->alpha_to == NULL){ + free(rs); + rs = NULL; + goto done; + } + rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->index_of == NULL){ + free(rs->alpha_to); + free(rs); + rs = NULL; + goto done; + } + + /* Generate Galois field lookup tables */ + rs->index_of[0] = A0; /* log(zero) = -inf */ + rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */ + sr = 1; + for(i=0;inn;i++){ + rs->index_of[sr] = i; + rs->alpha_to[i] = sr; + sr <<= 1; + if(sr & (1<nn; + } + if(sr != 1){ + /* field generator polynomial is not primitive! 
*/ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + + /* Form RS code generator polynomial from its roots */ + rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1)); + if(rs->genpoly == NULL){ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + rs->fcr = fcr; + rs->prim = prim; + rs->nroots = nroots; + + /* Find prim-th root of 1, used in decoding */ + for(iprim=1;(iprim % prim) != 0;iprim += rs->nn) + ; + rs->iprim = iprim / prim; + + rs->genpoly[0] = 1; + for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) { + rs->genpoly[i+1] = 1; + + /* Multiply rs->genpoly[] by @**(root + x) */ + for (j = i; j > 0; j--){ + if (rs->genpoly[j] != 0) + rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)]; + else + rs->genpoly[j] = rs->genpoly[j-1]; + } + /* rs->genpoly[0] can never be zero */ + rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)]; + } + /* convert rs->genpoly[] to index form for quicker encoding */ + for (i = 0; i <= nroots; i++) + rs->genpoly[i] = rs->index_of[rs->genpoly[i]]; + done:; + +} diff --git a/libfec/init_rs_char.c b/libfec/init_rs_char.c new file mode 100644 index 0000000..a51099a --- /dev/null +++ b/libfec/init_rs_char.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "char.h" +#include "rs-common.h" + +void free_rs_char(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = 
padding bytes at front of shortened block + */ +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/init_rs_int.c b/libfec/init_rs_int.c new file mode 100644 index 0000000..a6036c2 --- /dev/null +++ b/libfec/init_rs_int.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "int.h" +#include "rs-common.h" + +void free_rs_int(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/install-sh b/libfec/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/libfec/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. 
not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo 
"install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. 
+ + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. 
+ + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/libfec/int.h b/libfec/int.h new file mode 100644 index 0000000..46e865d --- /dev/null +++ b/libfec/int.h @@ -0,0 +1,22 @@ +/* Stuff specific to the general (integer) version of the Reed-Solomon codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned int data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + diff --git a/libfec/lesser.txt b/libfec/lesser.txt new file mode 100644 index 0000000..b1e3f5a --- /dev/null +++ b/libfec/lesser.txt @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. 
You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. 
+ + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it!
+ + diff --git a/libfec/libfec.pc.in b/libfec/libfec.pc.in new file mode 100644 index 0000000..c569da9 --- /dev/null +++ b/libfec/libfec.pc.in @@ -0,0 +1,13 @@ +prefix=@LIBFEC_PC_PREFIX@ +exec_prefix=@LIBFEC_PC_EXEC_PREFIX@ +libdir=@LIBFEC_PC_LIBDIR@ +includedir=@LIBFEC_PC_INCLUDEDIR@ + +Name: FEC library +Description: A fork of KA9Q's FEC library +Version: @LIBFEC_PC_VERSION@ +URL: http://opendigitalradio.org +Cflags: -I${includedir}/ @LIBFEC_PC_CFLAGS@ +Libs: -L${libdir}/ @LIBFEC_PC_LIBS@ +Libs.private: @LIBFEC_PC_PRIV_LIBS@ + diff --git a/libfec/makefile.in b/libfec/makefile.in new file mode 100644 index 0000000..cc116ab --- /dev/null +++ b/libfec/makefile.in @@ -0,0 +1,249 @@ +# Makefile prototype for configure +# Copyright 2004 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + +# @configure_input@ +srcdir = @srcdir@ +prefix = @prefix@ +exec_prefix=@exec_prefix@ +VPATH = @srcdir@ +CC=@CC@ +LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \ + viterbi39.o viterbi39_port.o \ + viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \ + decode_rs_char.o decode_rs_int.o decode_rs_8.o \ + init_rs_char.o init_rs_int.o ccsds_tab.o \ + encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \ + dotprod.o dotprod_port.o \ + peakval.o peakval_port.o \ + sumsq.o sumsq_port.o + +CFLAGS=@CFLAGS@ -I. 
-fPIC -Wall @ARCH_OPTION@ + +SHARED_LIB=@SH_LIB@ + +all: libfec.a $(SHARED_LIB) + + +test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest + @echo "Correctness tests:" + ./vtest27 -e 3.0 -n 1000 -v + ./vtest29 -e 2.5 -n 1000 -v + ./vtest39 -e 2.5 -n 1000 -v + ./vtest615 -e 1.0 -n 100 -v + ./rstest + ./dtest + ./sumsq_test + ./peaktest + @echo "Speed tests:" + ./vtest27 + ./vtest29 + ./vtest39 + ./vtest615 + +install: all + mkdir -p $(DESTDIR)@libdir@ + install -m 644 -p $(SHARED_LIB) libfec.a $(DESTDIR)@libdir@ +# (cd $(DESTDIR)@libdir@;ln -f -s $(SHARED_LIB) libfec.so) + @REBIND@ + mkdir -p $(DESTDIR)@includedir@ + install -m 644 -p fec.h $(DESTDIR)@includedir@ + mkdir -m 0755 -p $(DESTDIR)@mandir@/man3 + install -m 644 -p simd-viterbi.3 rs.3 dsp.3 $(DESTDIR)@mandir@/man3 + +peaktest: peaktest.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ + +sumsq_test: sumsq_test.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ + +dtest: dtest.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ -lm + +vtest27: vtest27.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ -lm + +vtest29: vtest29.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ -lm + +vtest39: vtest39.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ -lm + +vtest615: vtest615.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ -lm + +rstest: rstest.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ + +rs_speedtest: rs_speedtest.o libfec.a + gcc $(CFLAGS) -g -o $@ $^ + +# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug? 
+vtest27.o: vtest27.c fec.h + gcc $(CFLAGS) -g -c $< + +vtest29.o: vtest29.c fec.h + gcc $(CFLAGS) -g -c $< + +vtest39.o: vtest39.c fec.h + gcc $(CFLAGS) -g -c $< + +vtest615.o: vtest615.c fec.h + gcc $(CFLAGS) -g -c $< + +libfec.a: $(LIBS) + ar rv $@ $^ + ranlib libfec.a + +# for Darwin +libfec.dylib: $(LIBS) + $(CC) -dynamiclib -install_name $@ -o $@ $^ + +# for Linux et al +libfec.so: $(LIBS) + gcc -fPIC -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc -lm + +dotprod.o: dotprod.c fec.h + +dotprod_port.o: dotprod_port.c fec.h + +viterbi27.o: viterbi27.c fec.h + +viterbi27_port.o: viterbi27_port.c fec.h + +viterbi29.o: viterbi29.c fec.h + +viterbi39.o: viterbi39.c fec.h + +viterbi39_port.o: viterbi39_port.c fec.h + +viterbi39_sse2.o: viterbi39_sse2.c fec.h + +viterbi39_sse.o: viterbi39_sse.c fec.h + +viterbi39_mmx.o: viterbi39_mmx.c fec.h + +encode_rs_char.o: encode_rs_char.c char.h rs-common.h + +encode_rs_int.o: encode_rs_int.c int.h rs-common.h + +encode_rs_8.o: encode_rs_8.c fixed.h + +encode_rs_av.o: encode_rs_av.c fixed.h + +decode_rs_char.o: decode_rs_char.c char.h rs-common.h + +decode_rs_int.o: decode_rs_int.c int.h rs-common.h + +decode_rs_8.o: decode_rs_8.c fixed.h + +init_rs_char.o: init_rs_char.c char.h rs-common.h + +init_rs_int.o: init_rs_int.c int.h rs-common.h + +ccsds_tab.o: ccsds_tab.c + +ccsds_tab.c: gen_ccsds + ./gen_ccsds > ccsds_tab.c + +gen_ccsds: gen_ccsds.o init_rs_char.o + gcc $(CFLAGS) -o $@ $^ + +gen_ccsds.o: gen_ccsds.c + gcc $(CFLAGS) -c -o $@ $< + +ccsds_tal.o: ccsds_tal.c + +ccsds_tal.c: gen_ccsds_tal + ./gen_ccsds_tal > ccsds_tal.c + +exercise_char.o: exercise.c + gcc $(CFLAGS) -c -o $@ $< + +exercise_int.o: exercise.c + gcc -DBIGSYM=1 $(CFLAGS) -c -o $@ $< + +exercise_8.o: exercise.c + gcc -DFIXED=1 $(CFLAGS) -c -o $@ $< + +exercise_ccsds.o: exercise.c + gcc -DCCSDS=1 $(CFLAGS) -c -o $@ $< + +viterbi27.o: viterbi27.c fec.h + +viterbi27_port.o: viterbi27_port.c fec.h + +viterbi27_av.o: 
viterbi27_av.c fec.h + +viterbi27_mmx.o: viterbi27_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi27_sse.o: viterbi27_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi27_sse2.o: viterbi27_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi29.o: viterbi29.c fec.h + +viterbi29_port.o: viterbi29_port.c fec.h + +viterbi29_av.o: viterbi29_av.c fec.h + +viterbi29_mmx.o: viterbi29_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi29_sse.o: viterbi29_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi29_sse2.o: viterbi29_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi39.o: viterbi39.c fec.h + +viterbi39_port.o: viterbi39_port.c fec.h + +viterbi39_av.o: viterbi39_av.c fec.h + +viterbi39_mmx.o: viterbi39_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi39_sse.o: viterbi39_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi39_sse2.o: viterbi39_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +viterbi615.o: viterbi615.c fec.h + +viterbi615_port.o: viterbi615_port.c fec.h + +viterbi615_av.o: viterbi615_av.c fec.h + +viterbi615_mmx.o: viterbi615_mmx.c fec.h + gcc $(CFLAGS) -mmmx -c -o $@ $< + +viterbi615_sse.o: viterbi615_sse.c fec.h + gcc $(CFLAGS) -msse -c -o $@ $< + +viterbi615_sse2.o: viterbi615_sse2.c fec.h + gcc $(CFLAGS) -msse2 -c -o $@ $< + +cpu_mode_x86.o: cpu_mode_x86.c fec.h + +cpu_mode_x86_64.o: cpu_mode_x86_64.c fec.h + +cpu_mode_ppc.o: cpu_mode_ppc.c fec.h + +#%.o: %.s +# $(AS) $< -o $@ + + + +clean: + rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core + rm -rf autom4te.cache + +distclean: clean + rm -f config.log config.cache config.status config.h makefile + diff --git a/libfec/mmxbfly27.s b/libfec/mmxbfly27.s new file mode 100644 index 0000000..4abbf48 --- /dev/null +++ b/libfec/mmxbfly27.s @@ -0,0 +1,148 @@ +/* Intel SIMD MMX implementation of Viterbi ACS butterflies + for 64-state (k=7) 
convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ; +*/ + # MMX (64-bit SIMD) version + # requires Pentium-MMX, Pentium-II or better + + # These are offsets into struct v27, defined in viterbi27_mmx.c + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 + .text + .global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2 + .type update_viterbi27_blk_mmx,@function + .align 16 + +update_viterbi27_blk_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate needs '$'; a bare -1 is a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + movl 12(%ebp),%ebx # ebx = syms + movw (%ebx),%ax # ax = second symbol : first symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + movb %ah,%bl + andl $255,%eax + andl $255,%ebx + + # shift into first array index dimension slot + shll $5,%eax + shll $5,%ebx + + # each invocation of this macro will do 8 butterflies in parallel + .MACRO butterfly GROUP + # Compute branch metrics + movq (Mettab27_1+8*\GROUP)(%eax),%mm3 + movq fifteens,%mm0 + + paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3 + paddb ones,%mm3 # emulate pavgb - this may not be necessary + psrlq $1,%mm3 + pand %mm0,%mm3 + + movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1 + movq %mm6,%mm1 + movq %mm2,%mm7 + + paddb %mm3,%mm6 + paddb %mm3,%mm2 + pxor %mm0,%mm3 # invert branch metric + paddb %mm3,%mm7 # path metric for inverted symbols + paddb %mm3,%mm1 + + # live registers 1 2 6 7 + # Compare mm6 and mm7; mm1 and mm2 + pxor %mm3,%mm3 + movq %mm6,%mm4 + movq
%mm1,%mm5 + psubb %mm7,%mm4 # mm4 = mm6 - mm7 + psubb %mm2,%mm5 # mm5 = mm1 - mm2 + pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better) + pcmpgtb %mm3,%mm5 # mm5 = second set of decisions + + # live registers 1 2 4 5 6 7 + # select survivors + movq %mm4,%mm0 + pand %mm4,%mm7 + movq %mm5,%mm3 + pand %mm5,%mm2 + pandn %mm6,%mm0 + pandn %mm1,%mm3 + por %mm0,%mm7 # mm7 = first set of survivors + por %mm3,%mm2 # mm2 = second set of survivors + + # live registers 2 4 5 7 + # interleave & store decisions in mm4, mm5 + # interleave & store new branch metrics in mm2, mm7 + movq %mm4,%mm3 + movq %mm7,%mm0 + punpckhbw %mm5,%mm4 + punpcklbw %mm5,%mm3 + punpcklbw %mm2,%mm7 # interleave second 8 new metrics + punpckhbw %mm2,%mm0 # interleave first 8 new metrics + movq %mm4,(16*\GROUP+8)(%edx) + movq %mm3,(16*\GROUP)(%edx) + movq %mm7,(16*\GROUP)(%edi) + movq %mm0,(16*\GROUP+8)(%edi) + + .endm + +# invoke macro 4 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + + addl $64,%edx # bump decision pointer + + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +fifteens: + .byte 15,15,15,15,15,15,15,15 + + .align 8 +ones: .byte 1,1,1,1,1,1,1,1 diff --git a/libfec/mmxbfly29.s b/libfec/mmxbfly29.s new file mode 100644 index 0000000..e37cab8 --- /dev/null +++ b/libfec/mmxbfly29.s @@ -0,0 +1,161 @@ +/* Intel SIMD MMX implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits); +*/ 
+ + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + .text + .global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2 + .type update_viterbi29_blk_mmx,@function + .align 16 + + # MMX (64-bit SIMD) version + # requires Pentium-MMX, Pentium-II or better + +update_viterbi29_blk_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + movl 8(%ebp),%edx # edx = vp (redundant reload of the same argument; harmless) + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate needs '$'; a bare -1 is a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + movl 12(%ebp),%ebx # ebx = syms + movw (%ebx),%ax # ax = second symbol : first symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + movb %ah,%bl + andl $255,%eax + andl $255,%ebx + + # shift into first array index dimension slot + shll $7,%eax + shll $7,%ebx + + # each invocation of this macro will do 8 butterflies in parallel + .MACRO butterfly GROUP + # Compute branch metrics + movq (Mettab29_1+8*\GROUP)(%eax),%mm3 + movq fifteens,%mm0 + paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3 + paddb ones,%mm3 # emulate pavgb - this may not be necessary + psrlq $1,%mm3 + pand %mm0,%mm3 + + movq (8*\GROUP)(%esi),%mm6 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1 + movq %mm6,%mm1 + movq %mm2,%mm7 + + paddb %mm3,%mm6 + paddb %mm3,%mm2 + pxor %mm0,%mm3 # invert branch metric + paddb %mm3,%mm7 # path metric for inverted symbols + paddb %mm3,%mm1 + + # live registers 1 2 6 7 + # Compare mm6 and mm7; mm1 and mm2 + pxor %mm3,%mm3 + movq %mm6,%mm4 + movq %mm1,%mm5 + psubb %mm7,%mm4 # mm4 = mm6 - mm7 + psubb %mm2,%mm5 # mm5 = mm1 - mm2 + pcmpgtb %mm3,%mm4 # mm4 = first set of decisions (ff = 1 better) + pcmpgtb %mm3,%mm5 # mm5 = second set of
decisions + + # live registers 1 2 4 5 6 7 + # select survivors + movq %mm4,%mm0 + pand %mm4,%mm7 + movq %mm5,%mm3 + pand %mm5,%mm2 + pandn %mm6,%mm0 + pandn %mm1,%mm3 + por %mm0,%mm7 # mm7 = first set of survivors + por %mm3,%mm2 # mm2 = second set of survivors + + # live registers 2 4 5 7 + # interleave & store decisions in mm4, mm5 + # interleave & store new branch metrics in mm2, mm7 + movq %mm4,%mm3 + movq %mm7,%mm0 + punpckhbw %mm5,%mm4 + punpcklbw %mm5,%mm3 + punpcklbw %mm2,%mm7 # interleave second 8 new metrics + punpckhbw %mm2,%mm0 # interleave first 8 new metrics + movq %mm4,(16*\GROUP+8)(%edx) + movq %mm3,(16*\GROUP)(%edx) + movq %mm7,(16*\GROUP)(%edi) + movq %mm0,(16*\GROUP+8)(%edi) + + .endm + +# invoke macro 16 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + butterfly GROUP=8 + butterfly GROUP=9 + butterfly GROUP=10 + butterfly GROUP=11 + butterfly GROUP=12 + butterfly GROUP=13 + butterfly GROUP=14 + butterfly GROUP=15 + + addl $256,%edx # bump decision pointer + + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +fifteens: + .byte 15,15,15,15,15,15,15,15 + + .align 8 +ones: .byte 1,1,1,1,1,1,1,1 diff --git a/libfec/peak_mmx_assist.s b/libfec/peak_mmx_assist.s new file mode 100644 index 0000000..dae831f --- /dev/null +++ b/libfec/peak_mmx_assist.s @@ -0,0 +1,70 @@ +# MMX assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak value in signed 16-bit input samples +# int 
peakval_mmx(signed short *in,int cnt); + .global peakval_mmx + .type peakval_mmx,@function + .align 16 +peakval_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/libfec/peak_sse2_assist.s b/libfec/peak_sse2_assist.s new file mode 100644 index 0000000..1dee3a8 --- /dev/null +++ b/libfec/peak_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse2(signed short *in,int cnt); + .global peakval_sse2 + .type peakval_sse2,@function + .align 16 +peakval_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl 
$16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # max (peak) value now in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peak_sse_assist.s b/libfec/peak_sse_assist.s new file mode 100644 index 0000000..ea6fce8 --- /dev/null +++ b/libfec/peak_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse(signed short *in,int cnt); + .global peakval_sse + .type peakval_sse,@function + .align 16 +peakval_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # max (peak) value now in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peaktest.c b/libfec/peaktest.c new file mode 100644 index 0000000..fa4b280 --- /dev/null +++ b/libfec/peaktest.c @@ -0,0 +1,38 @@ +/* Verify correctness of the peak routine + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include +#include + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +int peakval(signed short *,int); +int peakval_port(signed short *,int); + +int main(){ + int i,s; + int result,rresult; + signed short samples[NSAMP]; +
srandom(time(NULL)); + + for(i=0;i +#include "fec.h" + +int peakval_port(signed short *b,int cnt); +#ifdef __i386__ +int peakval_mmx(signed short *b,int cnt); +int peakval_sse(signed short *b,int cnt); +int peakval_sse2(signed short *b,int cnt); +#endif + +#ifdef __x86_64__ +int peakval_sse2(signed short *b,int cnt); +#endif + +#ifdef __VEC__ +int peakval_av(signed short *b,int cnt); +#endif + +int peakval(signed short *b,int cnt){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return peakval_port(b,cnt); +#ifdef __i386__ + case MMX: + return peakval_mmx(b,cnt); + case SSE: + return peakval_sse(b,cnt); + case SSE2: + return peakval_sse2(b,cnt); +#endif + +#ifdef __x86_64__ + case SSE2: + return peakval_port(b,cnt); + //return peakval_sse2(b,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return peakval_av(b,cnt); +#endif + } +} diff --git a/libfec/peakval_av.c b/libfec/peakval_av.c new file mode 100644 index 0000000..ae54c10 --- /dev/null +++ b/libfec/peakval_av.c @@ -0,0 +1,61 @@ +/* Return the largest absolute value of a vector of signed shorts + + * This is the Altivec SIMD version. 
+ + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +signed short peakval_av(signed short *in,int cnt){ + vector signed short x; + int pad; + union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s; + vector signed short smallest,largest; + + smallest = (vector signed short)(0); + largest = (vector signed short)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + } + /* Combine and extract result */ + largest = vec_max(largest,vec_abs(smallest)); + + s.c[15] = 64; /* Shift right four 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 32; /* Shift right two 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 16; /* Shift right one 16-bit word */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.hv = largest; + return s.s[7]; +} diff --git a/libfec/peakval_mmx.c b/libfec/peakval_mmx.c new file mode 100644 index 0000000..436fe88 --- /dev/null +++ b/libfec/peakval_mmx.c @@ -0,0 +1,34 @@ +/* Wrapper for the MMX version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include + +int peakval_mmx_assist(signed short *,int); + +int peakval_mmx(signed short *b,int cnt){ + int 
peak = 0; + int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_mmx_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_mmx_assist.s b/libfec/peakval_mmx_assist.s new file mode 100644 index 0000000..553cb79 --- /dev/null +++ b/libfec/peakval_mmx_assist.s @@ -0,0 +1,70 @@ +# MMX assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak value in signed 16-bit input samples +# int peakval_mmx_assist(signed short *in,int cnt); + .global peakval_mmx_assist + .type peakval_mmx_assist,@function + .align 16 +peakval_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/libfec/peakval_port.c b/libfec/peakval_port.c new file mode 100644 index 0000000..07ab316 --- /dev/null +++ b/libfec/peakval_port.c @@ -0,0 +1,16 
@@ +/* Portable C version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" +int peakval_port(signed short *b,int len){ + int peak = 0; + int a,i; + + for(i=0;i peak) + peak = a; + } + return peak; +} diff --git a/libfec/peakval_sse.c b/libfec/peakval_sse.c new file mode 100644 index 0000000..9868b7f --- /dev/null +++ b/libfec/peakval_sse.c @@ -0,0 +1,35 @@ +/* IA-32 SSE version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include +#include "fec.h" + +int peakval_sse_assist(signed short *,int); + +int peakval_sse(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_sse2.c b/libfec/peakval_sse2.c new file mode 100644 index 0000000..79d9059 --- /dev/null +++ b/libfec/peakval_sse2.c @@ -0,0 +1,34 @@ +/* Portable C version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include "fec.h" + +int peakval_sse2_assist(signed short *,int); + +int peakval_sse2(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 15) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse2_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~7; + cnt &= 7; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_sse2_assist.s b/libfec/peakval_sse2_assist.s new file mode 100644 index 0000000..c7a58e7 --- /dev/null +++ b/libfec/peakval_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int 
peakval_sse2_assist(signed short *in,int cnt); + .global peakval_sse2_assist + .type peakval_sse2_assist,@function + .align 16 +peakval_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl $16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # min value in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peakval_sse_assist.s b/libfec/peakval_sse_assist.s new file mode 100644 index 0000000..827c800 --- /dev/null +++ b/libfec/peakval_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse_assist(signed short *in,int cnt); + .global peakval_sse_assist + .type peakval_sse_assist,@function + .align 16 +peakval_sse_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # min value in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx 
+ popl %esi + popl %ebp + ret diff --git a/libfec/rs-common.h b/libfec/rs-common.h new file mode 100644 index 0000000..e64eb39 --- /dev/null +++ b/libfec/rs-common.h @@ -0,0 +1,26 @@ +/* Stuff common to all the general-purpose Reed-Solomon codecs + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +/* Reed-Solomon codec control block */ +struct rs { + int mm; /* Bits per symbol */ + int nn; /* Symbols per block (= (1<= rs->nn) { + x -= rs->nn; + x = (x >> rs->mm) + (x & rs->nn); + } + return x; +} diff --git a/libfec/rs.3 b/libfec/rs.3 new file mode 100644 index 0000000..5d71503 --- /dev/null +++ b/libfec/rs.3 @@ -0,0 +1,198 @@ +.TH REED-SOLOMON 3 +.SH NAME +init_rs_int, encode_rs_int, decode_rs_int, free_rs_int, +init_rs_char, encode_rs_char, decode_rs_char, free_rs_char, +encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds +\- Reed-Solomon encoding/decoding +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" + +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_int(void *rs,int *data,int *parity); + +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); + +void free_rs_int(void *rs); + + +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_char(void *rs,unsigned char *data, + unsigned char *parity); + +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); + +void free_rs_char(void *rs); + + +void encode_rs_8(unsigned char *data,unsigned char *parity, + int pad); + +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras, + int pad); + + +void encode_rs_ccsds(unsigned char *data,unsigned char *parity, + int pad); + +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras, + int pad); + +unsigned char Taltab[256]; +unsigned char Tal1tab[256]; + +.fi + +.SH DESCRIPTION +These functions implement Reed-Solomon error control encoding and +decoding. 
For optimal performance in a variety of applications, three +sets of functions are supplied. To access these functions, add "-lfec" +to your linker command line. + +The functions with names ending in \fB_int\fR handle data in integer arrays, +permitting arbitrarily large codewords limited only by machine +resources. + +The functions with names ending in \fB_char\fR take unsigned char arrays and can +handle codes with symbols of 8 bits or less (i.e., with codewords of +255 symbols or less). + +\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific +(255,223) code with 8-bit symbols specified by the CCSDS: +a field generator of 1 + X + X^2 + X^7 + X^8 and a code +generator with first consecutive root = 112 and a primitive element of +11. These functions use the conventional +polynomial form, \fInot\fR the dual-basis specified in +the CCSDS standard, to represent symbols. This code may be +shortened by giving a non-zero \fBpad\fR value to produce a +(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the +specified number of zeroes at the front of the full codeword. + +For full CCSDS compatibility, \fBencode_rs_ccsds\fR and +\fBdecode_rs_ccsds\fR are provided. These functions use two lookup +tables, \fBTaltab\fR to convert from conventional to dual-basis, and +\fBTal1tab\fR to perform the inverse mapping from dual-basis to +conventional form, before and after calls to \fBencode_rs_8\fR +and \fBdecode_rs_8\fR. + +The \fB_8\fR and \fB_ccsds\fR functions do not require initialization. + +To use the general purpose RS encoder or decoder (i.e., +the \fB_char\fR or \fB_int\fR versions), the user must first +call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. The +arguments are as follows: + +\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR +or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a +huge code would exhaust memory limits on a 32-bit machine). 
The resulting +Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols, +each containing \fBsymsize\fR bits. The codeword may be shortened with the +\fBpad\fR parameter described below. + +\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients, +with the 0th coefficient in the low order bit. The polynomial +\fImust\fR be primitive; if not, the call will fail and NULL will be +returned. + +\fBfcr\fR gives, in index form, the first consecutive root of the +Reed Solomon code generator polynomial. + +\fBprim\fR gives, in index form, the primitive element in the Galois field +used to generate the Reed Solomon code generator polynomial. + +\fBnroots\fR gives the number of roots in the Reed Solomon code +generator polynomial. This equals the number of parity symbols +per code block. + +\fBpad\fR gives the number of leading symbols in the codeword +that are implicitly padded to zero in a shortened code block. + +The resulting Reed-Solomon code has parameters (N,K), where +N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR. + +The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept +the pointer returned by \fBinit_rs_char\fR or +\fBinit_rs_int\fR, respectively, to +encode a block of data using the specified code. +The input data array is expected to +contain K symbols (of \fBsymsize\fR bits each, right justified +in each char or int) and \fBnroots\fR parity symbols will be placed +into the \fBparity\fR array, right justified. + +The \fBdecode_\fR functions correct +the errors in a Reed-Solomon codeword of N symbols up to the capability of the code. +An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR +array to assist the decoder; this parameter may be NULL if no erasures +are given. The number of erased symbols must be given in the \fBno_eras\fR +parameter. + +To maximize performance, the encode and decode functions perform no +"sanity checking" of their inputs. 
Decoder failure may result if +\fBeras_pos\fR contains duplicate entries, and both encoder and +decoder will fail if an input symbol exceeds its allowable range. +(Symbol range overflow cannot occur with the \fB_8\fR or +\fB_ccsds\fR functions, +or with the \fB_char\fR functions when 8-bit symbols are specified.) + +The decoder corrects the symbols "in place", returning the number +of symbols in error. If the codeword is uncorrectable, -1 is returned +and the data block is unchanged. If \fBeras_pos\fR is non-null, it is +used to return a list of corrected symbol positions, in no particular +order. This means that the +array passed through this parameter \fImust\fR have at least \fBnroots\fR +elements to prevent a possible buffer overflow. + +The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal +space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions, +respectively. + +The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have +corresponding \fBinit\fR and \fBfree\fR, nor do they take the +\fBrs\fR argument accepted by the other functions as their parameters +are statically compiled. These functions implement a code +equivalent to calling + +\fBinit_rs_char\fR(8,0x187,112,11,32,pad); + +and using the resulting pointer with \fBencode_rs_char\fR and +\fBdecode_rs_char\fR. + +.SH RETURN VALUES +\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal +control structure that must be passed to the corresponding encode, decode +and free functions. These functions return NULL on error. + +The \fBdecode_\fR functions return a count of corrected +symbols, or -1 if the block was uncorrectable. + +.SH AUTHOR +Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert +Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy +(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar +Welz (dwelz@web.de). + +.SH COPYRIGHT +Copyright 2004, Phil Karn, KA9Q.
May be used under the terms of the +GNU Lesser General Public License (LGPL). + +.SH SEE ALSO +CCSDS 101.0-B-6: Telemetry Channel Coding. +http://www.ccsds.org/documents/101x0b6.pdf + +.SH NOTE +CCSDS chose the "dual basis" symbol representation because it +simplified the implementation of a Reed-Solomon encoder in dedicated +hardware. However, this approach holds no advantages for a software +implementation on a general purpose computer, so use of the dual basis +is recommended only if compatibility with the CCSDS standard is needed, +e.g., to decode data from an existing spacecraft using the CCSDS +standard. If you just want a fast (255,223) RS codec without needing +to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR +and \fBdecode_rs_8\fR. + diff --git a/libfec/rs_speedtest.c b/libfec/rs_speedtest.c new file mode 100644 index 0000000..225f160 --- /dev/null +++ b/libfec/rs_speedtest.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include +#include "fec.h" + +int main(){ + unsigned char block[255]; + int i; + void *rs; + struct rusage start,finish; + double extime; + int trials = 10000; + + for(i=0;i<223;i++) + block[i] = 0x01; + + rs = init_rs_char(8,0x187,112,11,32,0); + encode_rs_char(rs,block,&block[223]); + + getrusage(RUSAGE_SELF,&start); + for(i=0;i +#include +#include +#include +#include "fec.h" + + +struct etab { + int symsize; + int genpoly; + int fcs; + int prim; + int nroots; + int ntrials; +} Tab[] = { + {2, 0x7, 1, 1, 1, 10 }, + {3, 0xb, 1, 1, 2, 10 }, + {4, 0x13, 1, 1, 4, 10 }, + {5, 0x25, 1, 1, 6, 10 }, + {6, 0x43, 1, 1, 8, 10 }, + {7, 0x89, 1, 1, 10, 10 }, + {8, 0x11d, 1, 1, 32, 10 }, + {8, 0x187, 112,11, 32, 10 }, /* Duplicates CCSDS codec */ + {9, 0x211, 1, 1, 32, 10 }, + {10,0x409, 1, 1, 32, 10 }, + {11,0x805, 1, 1, 32, 10 }, + {12,0x1053, 1, 1, 32, 5 }, + {13,0x201b, 1, 1, 32, 2 }, + {14,0x4443, 1, 1, 32, 1 }, + {15,0x8003, 1, 1, 32, 1 }, + {16,0x1100b, 1, 1, 32, 1 }, + {0, 0, 0, 0, 0}, +}; + +int 
exercise_char(struct etab *e); +int exercise_int(struct etab *e); +int exercise_8(void); + +int main(){ + int i; + + srandom(time(NULL)); + + printf("Testing fixed CCSDS encoder...\n"); + exercise_8(); + for(i=0;Tab[i].symsize != 0;i++){ + int nn,kk; + + nn = (1<symsize) - 1; + unsigned char block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + if(e->symsize > 8) + return -1; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_char failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;isymsize) - 1; + int block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_int failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i +#include +#include "fec.h" + +#define MAX_RANDOM 0x7fffffff + +/* Generate gaussian random double with specified mean and std_dev */ +double normal_rand(double mean, double std_dev) +{ + double fac,rsq,v1,v2; + static double gset; + static int iset; + + if(iset){ + /* Already got one */ + iset = 0; + return mean + std_dev*gset; + } + /* Generate two evenly distributed numbers between -1 and +1 + * that are inside the unit circle + */ + do { + v1 = 2.0 * (double)random() / MAX_RANDOM - 1; + v2 = 2.0 * (double)random() / MAX_RANDOM - 1; + rsq = v1*v1 + v2*v2; + } 
while(rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0*log(rsq)/rsq); + gset = v1*fac; + iset++; + return mean + std_dev*v2*fac; +} + +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){ + int sample; + + sample = offset + gain*normal_rand(sym?amp:-amp,1.0); + /* Clip to 8-bit offset range */ + if(sample < 0) + sample = 0; + else if(sample > clip) + sample = clip; + return sample; +} diff --git a/libfec/simd-viterbi.3 b/libfec/simd-viterbi.3 new file mode 100644 index 0000000..4c67593 --- /dev/null +++ b/libfec/simd-viterbi.3 @@ -0,0 +1,247 @@ +.TH SIMD-VITERBI 3 +.SH NAME +create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk, +chainback_viterbi27, delete_viterbi27, +create_viterbi29, set_viterbi_29_polynomial, init_viterbi29, update_viterbi29_blk, +chainback_viterbi29, delete_viterbi29, +create_viterbi39, set_viterbi_39_polynomial, init_viterbi39, update_viterbi39_blk, +chainback_viterbi39, delete_viterbi39, +create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk, +chainback_viterbi615, delete_viterbi615 -\ IA32 SIMD-assisted Viterbi decoders +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" +void *create_viterbi27(int blocklen); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi29(int blocklen); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi39(int blocklen); +void set_viterbi39_polynomial(int polys[3]); +int 
init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi615(int blocklen); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); +.fi +.SH DESCRIPTION +These functions implement high performance Viterbi decoders for four +convolutional codes: a rate 1/2 constraint length 7 (k=7) code +("viterbi27"), a rate 1/2 k=9 code ("viterbi29"), +a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615"). +The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve +decoding speed. + +On the IA32 there are three different SIMD instruction sets. The first +and most common is MMX, introduced on later Intel Pentiums and then on +the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe, +etc). SSE was introduced on the Pentium III and later implemented in +the AMD Athlon 4 (AMD calls it "3D Now! Professional"). Most +recently, SSE2 was introduced in the Intel Pentium 4, and has been +adopted by more recent AMD CPUs. The presence of SSE2 implies the +existence of SSE, which in turn implies MMX. + +Altivec is the PowerPC SIMD instruction set. It is roughly comparable +to SSE2. Altivec was introduced to the general public in the Apple +Macintosh G4; it is also present in the G5. Altivec is actually a +Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it +"VMX". All refer to the same thing. + +When built for the IA32 or PPC architectures, the functions +automatically use the most powerful SIMD instruction set available. 
If +no SIMD instructions are available, or if the library is built for a +non-IA32, non-PPC machine, a portable C version is executed +instead. + +.SH USAGE +Four versions of each function are provided, one for each code. +In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39" +or "viterbi615" as desired. + +Before Viterbi decoding can begin, an instance must first be created with +\fBcreate_viterbi()\fR. This function creates and returns a pointer to +an internal control structure +containing the path metrics and the branch +decisions. \fBcreate_viterbi()\fR takes one argument that gives the +length of the data block in bits. You \fImust not\fR attempt to +decode a block longer than the length given to \fBcreate_viterbi()\fR. + +Before decoding a new frame, +\fBinit_viterbi()\fR must be called to reset the decoder state. +It accepts the instance pointer returned by +\fBcreate_viterbi()\fR and the initial starting state of the +convolutional encoder (usually 0). If the initial starting state is unknown or +incorrect, the decoder will still function but the decoded data may be +incorrect at the start of the block. + +Blocks of received symbols are processed with calls to +\fBupdate_viterbi_blk()\fR. The \fBnbits\fR parameter specifies the +number of \fIdata bits\fR (not channel symbols) represented by the +\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in +\fBsyms\fR is twice \fInbits\fR, and so on.) +Each symbol is expected to range +from 0 through 255, with 0 corresponding to a "strong 0" and 255 +corresponding to a "strong 1". The caller is responsible for +determining the proper pairing of input symbols (commonly known as +decoder symbol phasing). + +At the end of the block, the data is recovered with a call to +\fBchainback_viterbi()\fR. 
The arguments are the pointer to the +decoder instance, a pointer to a user-supplied buffer into which the +decoded data is to be written, the number of data bits (not bytes) +that are to be decoded, and the terminal state of the convolutional +encoder at the end of the frame (usually 0). If the terminal state is +incorrect or unknown, the decoded data bits at the end of the frame +may be unreliable. The decoded data is written in big-endian order, +i.e., the first bit in the frame is written into the high order bit of +the first byte in the buffer. If the frame is not an integral number +of bytes long, the low order bits of the last byte in the frame will +be unused. + +Note that the decoders assume the use of a tail, i.e., the encoding +and transmission of a sufficient number of padding bits beyond the end +of the user data to force the convolutional encoder into the known +terminal state given to \fBchainback_viterbi()\fR. The tail is +always one bit less than the constraint length of the code, so the k=7 +code uses 6 tail bits (12 tail symbols), the k=9 code uses 8 tail bits +(16 tail symbols) and the k=15 code uses 14 tail bits (84 tail +symbols). + +The tail bits are not included in the length arguments to +\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if +the block contains 1000 user bits, then this would be the length +parameter given to \fBcreate_viterbi27()\fR and +\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called +with a total of 2012 symbols - the last 12 encoded symbols +representing the tail bits. + +After the call to \fBchainback_viterbi()\fR, the decoder may be reset +with a call to \fBinit_viterbi()\fR and another block can be decoded. +Alternatively, \fBdelete_viterbi()\fR can be called to free all resources +used by the Viterbi decoder. + +The \fBset_viterbi_polynomial()\fR function allows use of other than the default +code generator polynomials. 
Although only one set of polynomials is generally +used with each code, there are different conventions as to their order and +symbol polarity, and these functions simplify their use. + +The default polynomials for the viterbi27 routines +are those of the NASA-JPL convention \fIwithout\fR symbol inversion. +The NASA-JPL convention normally inverts the first symbol. +The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second. +.sp +To set the NASA-JPL convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { -V27POLYA,V27POLYB }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +and to set the CCSDS convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { V27POLYB,-V27POLYA }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +The default polynomials for the viterbi615 routines +are those used by the Cassini spacecraft \fIwithout\fR +symbol inversion. Mars Pathfinder (MPF) and STEREO +swap the third and fourth polynomials. +Both conventions invert the +first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions. +.sp +To set the Cassini convention with symbol inversion, do the following: + +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi +.sp +and to set the MPF/STEREO convention with symbol inversion: +.sp +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi + +For performance reasons, calling this function changes the code +generator polynomials for \fIall\fR instances of the corresponding Viterbi decoder, +including those already created. + +.SH ERROR PERFORMANCE +These decoders have all been extensively tested and found to provide +performance consistent with that expected for soft-decision Viterbi +decoding with 8-bit symbols.
+ +Due to internal differences, the implementations +vary slightly in error performance. In +general, the portable C versions exhibit the best error performance +because they use full-sized branch metrics, and the MMX versions +exhibit the worst because they use 8-bit branch metrics with modulo +comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and +r=1/2 k=9 codes use unsigned +8-bit branch metrics, and are almost as good as the C versions. The +r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD +versions. + +.SH DIRECT ACCESS TO SPECIFIC FUNCTION VERSIONS +Calling the functions listed above automatically calls the appropriate +version of the function depending on the CPU type and available SIMD +instructions. A particular version can also be called directly by +appending the appropriate suffix to the function name. The available +suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX, +SSE, SSE2, Altivec and portable versions, respectively. For example, +the SSE2 version of the update_viterbi27_blk() function can be invoked +as update_viterbi27_blk_sse2(). + +Naturally, the _av functions are only available on the PowerPC and the +_mmx, _sse and _sse2 versions are only available on IA-32. Calling +a SIMD-enabled function on a CPU that doesn't support the appropriate +set of instructions will result in an illegal instruction exception. + +.SH RETURN VALUES +\fBcreate_viterbi\fR returns a pointer to the structure containing +the decoder state. +The other functions return -1 on error, 0 otherwise. + +.SH AUTHOR & COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) + +.SH LICENSE +This software may be used under the terms of the GNU Lesser General Public License (LGPL).
+ + diff --git a/libfec/sqtest.c b/libfec/sqtest.c new file mode 100644 index 0000000..b2abb09 --- /dev/null +++ b/libfec/sqtest.c @@ -0,0 +1,42 @@ +/* Verify correctness of the sum-of-square routines */ +#include +#include +#include + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +long long sumsq_wq(signed short *in,int cnt); +long long sumsq_wq_ref(signed short *in,int cnt); + +int main(){ + int i; + long long result,rresult; + signed short samples[NSAMP]; + + srandom(time(NULL)); + + for(i=0;i old metrics + movq NEWMETRICS(%rdx),%rdi # edi -> new metrics + movq DP(%rdx),%rdx # edx -> decisions + +1: movq 16(%rbp),%rax # eax = nbits + decq %rax + jl 2f # passed zero, we're done + movq %rax,16(%rbp) + + xorq %rax,%rax + movq 12(%rbp),%rbx # ebx = syms + movb (%rbx),%al + movd %rax,%xmm6 # xmm6[0] = first symbol + movb 1(%rbx),%al + movd %rax,%xmm5 # xmm5[0] = second symbol + addq $2,%rbx + movq %rbx,12(%rbp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones(%rip),%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa (Branchtab27_sse2+(16*\GROUP))(%rip),%xmm4 + movdqa (Branchtab27_sse2+32+(16*\GROUP))(%rip),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + + # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics + # This is okay for this + # code because the worst-case metric spread (at high Eb/No) is only 120, + # well within the range of our unsigned 8-bit path metrics, and even within + # the range of signed 8-bit path metrics + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, 
high bit = 0 + movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 # note use of saturating arithmetic + paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not? + + # negate branch metrics + pxor %xmm7,%xmm4 + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%rdi) + movdqa %xmm4,(32*\GROUP)(%rdi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shlq $16,%rbx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orq %rax,%rbx + movq %rbx,(4*\GROUP)(%rdx) + .endm + + # invoke macro 2 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + + addq $8,%rdx # bump decision pointer + + # See if we have to normalize. This requires an explanation. We don't want + # our path metrics to exceed 255 on the *next* iteration. Since the + # largest branch metric is 30, that means we don't want any to exceed 225 + # on *this* iteration. Rather than look them all, we just pick an arbitrary one + # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally- + # determined worst-case metric spread for this code and branch metrics in the range 0-30. 
+ + # This is extremely conservative, and empirical testing at a variety of Eb/Nos might + # show that a higher threshold could be used without affecting BER performance + movq (%rdi),%rax # extract first output metric + andq $255,%rax + cmp $105,%rax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics. We can't just pick an arbitrary small constant because + # the minimum metric might be zero! + movdqa (%rdi),%xmm0 + movdqa %xmm0,%xmm4 + movdqa 16(%rdi),%xmm1 + pminub %xmm1,%xmm4 + movdqa 32(%rdi),%xmm2 + pminub %xmm2,%xmm4 + movdqa 48(%rdi),%xmm3 + pminub %xmm3,%xmm4 + + # crunch down to single lowest metric + movdqa %xmm4,%xmm5 + psrldq $8,%xmm5 # the count to psrldq is bytes, not bits! + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $32,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $16,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $8,%xmm5 + pminub %xmm5,%xmm4 # now in lowest byte of %xmm4 + + punpcklbw %xmm4,%xmm4 # lowest 2 bytes + pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes + punpcklqdq %xmm4,%xmm4 # all 16 bytes + + # xmm4 now contains lowest metric in all 16 bytes + # subtract it from every output metric + psubusb %xmm4,%xmm0 + psubusb %xmm4,%xmm1 + psubusb %xmm4,%xmm2 + psubusb %xmm4,%xmm3 + movdqa %xmm0,(%rdi) + movdqa %xmm1,16(%rdi) + movdqa %xmm2,32(%rdi) + movdqa %xmm3,48(%rdi) + +done: + # swap metrics + movq %rsi,%rax + movq %rdi,%rsi + movq %rax,%rdi + jmp 1b + +2: movq 8(%rbp),%rbx # ebx = vp + # stash metric pointers + movq %rsi,OLDMETRICS(%rbx) + movq %rdi,NEWMETRICS(%rbx) + movq %rdx,DP(%rbx) # stash incremented value of vp->dp + xorq %rax,%rax +err: popq %rbx + popq %rdx + popq %rdi + popq %rsi + popq %rbp + ret + + .data + .align 16 + +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 diff --git a/libfec/sse2bfly27.s b/libfec/sse2bfly27.s new file mode 100644 index 0000000..27422a2 --- /dev/null +++ b/libfec/sse2bfly27.s @@ -0,0 +1,202 @@ 
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2003 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+   Runs nbits add-compare-select steps, reading two symbol bytes per step; %eax is 0 on return, or -1 if vp is NULL.
+   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ;
+*/
+	# SSE2 (128-bit integer SIMD) version
+	# Requires Pentium 4 or better
+
+	# These are offsets into struct v27, defined in viterbi27.h
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text
+	.global update_viterbi27_blk_sse2,Branchtab27_sse2
+	.type update_viterbi27_blk_sse2,@function
+	.align 16
+
+update_viterbi27_blk_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax	# NULL vp: return -1 ($ makes it an immediate; a bare -1 is a load from address 0xffffffff)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f	# passed zero, we're done
+	movl %eax,16(%ebp)	# write the decremented count back to the stack argument
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%xmm6	# xmm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%xmm5	# xmm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# advance the syms argument past the two bytes just consumed
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones,%xmm7	# xmm7 = 31 in every byte (5-bit mask, also used to negate metrics)
+
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
+	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+
+	# compute 5-bit branch metric in xmm4 by adding the individual symbol metrics
+	# This is okay for this
+	# code because the worst-case metric spread (at high Eb/No) is only 120,
+	# well within the range of our unsigned 8-bit path metrics, and even within
+	# the range of signed 8-bit path metrics
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4	# psrlw shifts 16-bit lanes, so mask each byte back down to 5 bits
+
+	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
+	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
+	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?
+
+	# negate branch metrics
+	pxor %xmm7,%xmm4
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2
+
+	# Find survivors, leave in xmm0,2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+
+	# interleave and store new metrics from xmm0,2
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%edi)
+	movdqa %xmm4,(32*\GROUP)(%edi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shll $16,%ebx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orl %eax,%ebx
+	movl %ebx,(4*\GROUP)(%edx)
+	.endm
+
+	# invoke macro 2 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+
+	addl $8,%edx		# bump decision pointer
+
+	# See if we have to normalize. This requires an explanation. We don't want
+	# our path metrics to exceed 255 on the *next* iteration. Since the
+	# largest branch metric is 30, that means we don't want any to exceed 225
+	# on *this* iteration. Rather than look at them all, we just pick an arbitrary one
+	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
+	# determined worst-case metric spread for this code and branch metrics in the range 0-30.
+
+	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
+	# show that a higher threshold could be used without affecting BER performance
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $105,%eax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics. We can't just pick an arbitrary small constant because
+	# the minimum metric might be zero!
+	movdqa (%edi),%xmm0
+	movdqa %xmm0,%xmm4
+	movdqa 16(%edi),%xmm1
+	pminub %xmm1,%xmm4
+	movdqa 32(%edi),%xmm2
+	pminub %xmm2,%xmm4
+	movdqa 48(%edi),%xmm3
+	pminub %xmm3,%xmm4
+
+	# crunch down to single lowest metric
+	movdqa %xmm4,%xmm5
+	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $32,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $16,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $8,%xmm5
+	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4
+
+	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
+	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
+	punpcklqdq %xmm4,%xmm4	# all 16 bytes
+
+	# xmm4 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	psubusb %xmm4,%xmm0
+	psubusb %xmm4,%xmm1
+	psubusb %xmm4,%xmm2
+	psubusb %xmm4,%xmm3
+	movdqa %xmm0,(%edi)
+	movdqa %xmm1,16(%edi)
+	movdqa %xmm2,32(%edi)
+	movdqa %xmm3,48(%edi)
+
+done:
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax	# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/libfec/sse2bfly29-64.s b/libfec/sse2bfly29-64.s
new file mode 100644
index 0000000..22bd8a1
--- /dev/null
+++ b/libfec/sse2bfly29-64.s
@@ -0,0
+1,254 @@ +/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + Modifications for x86_64, 2012 Matthias P. Braendli, HB9EGM + - changed registers to x86-64 equivalents + - changed instructions accordingly + - %rip indirect addressing needed for position independent code, + which is required because x86-64 needs dynamic libs to be PIC. + That still doesn't work + + void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; +*/ + # SSE2 (128-bit integer SIMD) version + # All X86-64 CPUs include SSE2 + + # These are offsets into struct v29, defined in viterbi29_av.c + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + + .text + .global update_viterbi29_blk_sse2,Branchtab29_sse2 + .type update_viterbi29_blk_sse2,@function + .align 16 + +update_viterbi29_blk_sse2: + pushq %rbp + movq %rsp,%rbp + /* convention different between i386 and x86_64: rsi and rdi belong to called function, not caller */ + /* Let's say we don't care (yet) */ + pushq %rsi + pushq %rdi + pushq %rdx + pushq %rbx + + movq 8(%rbp),%rdx # edx = vp + testq %rdx,%rdx + jnz 0f + movq -1,%rax + jmp err +0: movq OLDMETRICS(%rdx),%rsi # esi -> old metrics + movq NEWMETRICS(%rdx),%rdi # edi -> new metrics + movq DP(%rdx),%rdx # edx -> decisions + +1: movq 16(%rbp),%rax # eax = nbits + decq %rax + jl 2f # passed zero, we're done + movq %rax,16(%rbp) + + xorq %rax,%rax + movq 12(%rbp),%rbx # ebx = syms + movb (%rbx),%al + movd %rax,%xmm6 # xmm6[0] = first symbol + movb 1(%rbx),%al + movd %rax,%xmm5 # xmm5[0] = second symbol + addq $2,%rbx + movq %rbx,12(%rbp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + movdqa thirtyones(%rip),%xmm7 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 
now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones(%rip),%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab29_sse2+(16*\GROUP)(%rip),%xmm4 + movdqa Branchtab29_sse2+128+(16*\GROUP)(%rip),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 # xmm4 contains branch metrics + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 + paddusb %xmm4,%xmm3 + + # invert branch metrics + pxor %xmm7,%xmm4 + + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%rdi) + movdqa %xmm4,(32*\GROUP)(%rdi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shlq $16,%rbx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orq %rax,%rbx + movq %rbx,(4*\GROUP)(%rdx) + .endm + + # invoke macro 8 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + + addq $32,%rdx # bump decision pointer + + # see if we have to normalize + movq (%rdi),%rax # extract first output metric + andq $255,%rax + cmp $50,%rax # is it greater than 50? 
+ movq $0,%rax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movdqa (%rdi),%xmm0 + pminub 16(%rdi),%xmm0 + pminub 32(%rdi),%xmm0 + pminub 48(%rdi),%xmm0 + pminub 64(%rdi),%xmm0 + pminub 80(%rdi),%xmm0 + pminub 96(%rdi),%xmm0 + pminub 112(%rdi),%xmm0 + pminub 128(%rdi),%xmm0 + pminub 144(%rdi),%xmm0 + pminub 160(%rdi),%xmm0 + pminub 176(%rdi),%xmm0 + pminub 192(%rdi),%xmm0 + pminub 208(%rdi),%xmm0 + pminub 224(%rdi),%xmm0 + pminub 240(%rdi),%xmm0 + + # crunch down to single lowest metric + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 # the count to psrldq is bytes, not bits! + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $32,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $16,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $8,%xmm0 + pminub %xmm1,%xmm0 + + punpcklbw %xmm0,%xmm0 # lowest 2 bytes + pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes + punpcklqdq %xmm0,%xmm0 # all 16 bytes + + # xmm0 now contains lowest metric in all 16 bytes + # subtract it from every output metric + movdqa (%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,(%rdi) + movdqa 16(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,16(%rdi) + movdqa 32(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,32(%rdi) + movdqa 48(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,48(%rdi) + movdqa 64(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,64(%rdi) + movdqa 80(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,80(%rdi) + movdqa 96(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,96(%rdi) + movdqa 112(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,112(%rdi) + movdqa 128(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,128(%rdi) + movdqa 144(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,144(%rdi) + movdqa 160(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,160(%rdi) + movdqa 176(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,176(%rdi) + movdqa 192(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,192(%rdi) + movdqa 
208(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,208(%rdi) + movdqa 224(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,224(%rdi) + movdqa 240(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,240(%rdi) + +done: + # swap metrics + movq %rsi,%rax + movq %rdi,%rsi + movq %rax,%rdi + jmp 1b + +2: movq 8(%rbp),%rbx # ebx = vp + # stash metric pointers + movq %rsi,OLDMETRICS(%rbx) + movq %rdi,NEWMETRICS(%rbx) + movq %rdx,DP(%rbx) # stash incremented value of vp->dp + xorq %rax,%rax +err: popq %rbx + popq %rdx + popq %rdi + popq %rsi + popq %rbp + ret + + .data + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + diff --git a/libfec/sse2bfly29.s b/libfec/sse2bfly29.s new file mode 100644 index 0000000..0fa1742 --- /dev/null +++ b/libfec/sse2bfly29.s @@ -0,0 +1,245 @@ +/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; +*/ + + # SSE2 (128-bit integer SIMD) version + # Requires Pentium 4 or better + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + + .text + .global update_viterbi29_blk_sse2,Branchtab29_sse2 + .type update_viterbi29_blk_sse2,@function + .align 16 + +update_viterbi29_blk_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl -1,%eax + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%xmm6 # xmm6[0] = first symbol + movb 
1(%ebx),%al + movd %eax,%xmm5 # xmm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + movdqa thirtyones,%xmm7 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones,%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab29_sse2+(16*\GROUP),%xmm4 + movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 # xmm4 contains branch metrics + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 + paddusb %xmm4,%xmm3 + + # invert branch metrics + pxor %xmm7,%xmm4 + + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%edi) + movdqa %xmm4,(32*\GROUP)(%edi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shll $16,%ebx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orl %eax,%ebx + movl %ebx,(4*\GROUP)(%edx) + .endm + + # invoke macro 8 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + 
butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + + addl $32,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movdqa (%edi),%xmm0 + pminub 16(%edi),%xmm0 + pminub 32(%edi),%xmm0 + pminub 48(%edi),%xmm0 + pminub 64(%edi),%xmm0 + pminub 80(%edi),%xmm0 + pminub 96(%edi),%xmm0 + pminub 112(%edi),%xmm0 + pminub 128(%edi),%xmm0 + pminub 144(%edi),%xmm0 + pminub 160(%edi),%xmm0 + pminub 176(%edi),%xmm0 + pminub 192(%edi),%xmm0 + pminub 208(%edi),%xmm0 + pminub 224(%edi),%xmm0 + pminub 240(%edi),%xmm0 + + # crunch down to single lowest metric + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 # the count to psrldq is bytes, not bits! + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $32,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $16,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $8,%xmm0 + pminub %xmm1,%xmm0 + + punpcklbw %xmm0,%xmm0 # lowest 2 bytes + pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes + punpcklqdq %xmm0,%xmm0 # all 16 bytes + + # xmm0 now contains lowest metric in all 16 bytes + # subtract it from every output metric + movdqa (%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,(%edi) + movdqa 16(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,16(%edi) + movdqa 32(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,32(%edi) + movdqa 48(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,48(%edi) + movdqa 64(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,64(%edi) + movdqa 80(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,80(%edi) + movdqa 96(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,96(%edi) + movdqa 112(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,112(%edi) + movdqa 128(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,128(%edi) + movdqa 144(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa 
%xmm1,144(%edi) + movdqa 160(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,160(%edi) + movdqa 176(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,176(%edi) + movdqa 192(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,192(%edi) + movdqa 208(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,208(%edi) + movdqa 224(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,224(%edi) + movdqa 240(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,240(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + diff --git a/libfec/ssebfly27.s b/libfec/ssebfly27.s new file mode 100644 index 0000000..7f445da --- /dev/null +++ b/libfec/ssebfly27.s @@ -0,0 +1,205 @@ +/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2001 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; +*/ + + # SSE (64-bit integer SIMD) version + # Requires Pentium III or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 +.text +.global update_viterbi27_blk_sse,Branchtab27_sse + .type update_viterbi27_blk_sse,@function + .align 16 + +update_viterbi27_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl -1,%eax + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> 
decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # %ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + movq thirtyones,%mm7 + + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab27_sse+(8*\GROUP),%mm4 + movq Branchtab27_sse+32+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. 
This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 4 times for a total of 32 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + + addl $8,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmpl $150,%eax # is it greater than 150? 
+ movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + + movd %mm0,%eax + and $0xff,%eax + +done: # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + + ret + + .data + + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + + diff --git a/libfec/ssebfly29.s b/libfec/ssebfly29.s new file mode 100644 index 0000000..d7d2149 --- /dev/null +++ b/libfec/ssebfly29.s @@ -0,0 +1,271 @@ +/* Intel SIMD SSE implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits); +*/ + # SSE (64-bit integer SIMD) version + # Requires 
Pentium III or better + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + .text + .global update_viterbi29_blk_sse,Branchtab29_sse + .type update_viterbi29_blk_sse,@function + .align 16 + +update_viterbi29_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl -1,%eax + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + + movq thirtyones,%mm7 + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab29_sse+(8*\GROUP),%mm4 + movq Branchtab29_sse+128+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+128)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. 
This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 16 times for a total of 128 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=4 + butterfly GROUP=5 + movl %ebx,8(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=6 + butterfly GROUP=7 + movl %ebx,12(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=8 + butterfly GROUP=9 + movl %ebx,16(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=10 + butterfly GROUP=11 + movl %ebx,20(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=12 + butterfly GROUP=13 + movl %ebx,24(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=14 + butterfly GROUP=15 + movl %ebx,28(%edx) # stash second 32 decisions + + addl $32,%edx # bump 
decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + pminub 64(%edi),%mm0 + pminub 72(%edi),%mm0 + pminub 80(%edi),%mm0 + pminub 88(%edi),%mm0 + pminub 96(%edi),%mm0 + pminub 104(%edi),%mm0 + pminub 112(%edi),%mm0 + pminub 120(%edi),%mm0 + pminub 128(%edi),%mm0 + pminub 136(%edi),%mm0 + pminub 144(%edi),%mm0 + pminub 152(%edi),%mm0 + pminub 160(%edi),%mm0 + pminub 168(%edi),%mm0 + pminub 176(%edi),%mm0 + pminub 184(%edi),%mm0 + pminub 192(%edi),%mm0 + pminub 200(%edi),%mm0 + pminub 208(%edi),%mm0 + pminub 216(%edi),%mm0 + pminub 224(%edi),%mm0 + pminub 232(%edi),%mm0 + pminub 240(%edi),%mm0 + pminub 248(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + movq 8(%edi),%mm1 # reload + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + PSUBUSBM %mm0,64(%edi) + PSUBUSBM %mm0,72(%edi) + PSUBUSBM %mm0,80(%edi) + PSUBUSBM %mm0,88(%edi) + PSUBUSBM %mm0,96(%edi) + PSUBUSBM %mm0,104(%edi) + PSUBUSBM %mm0,112(%edi) + PSUBUSBM %mm0,120(%edi) + PSUBUSBM %mm0,128(%edi) 
+	PSUBUSBM %mm0,136(%edi)
+	PSUBUSBM %mm0,144(%edi)
+	PSUBUSBM %mm0,152(%edi)
+	PSUBUSBM %mm0,160(%edi)
+	PSUBUSBM %mm0,168(%edi)
+	PSUBUSBM %mm0,176(%edi)
+	PSUBUSBM %mm0,184(%edi)
+	PSUBUSBM %mm0,192(%edi)
+	PSUBUSBM %mm0,200(%edi)
+	PSUBUSBM %mm0,208(%edi)
+	PSUBUSBM %mm0,216(%edi)
+	PSUBUSBM %mm0,224(%edi)
+	PSUBUSBM %mm0,232(%edi)
+	PSUBUSBM %mm0,240(%edi)
+	PSUBUSBM %mm0,248(%edi)
+
+done:
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31
+
+
diff --git a/libfec/sumsq.c b/libfec/sumsq.c
new file mode 100644
index 0000000..e567c89
--- /dev/null
+++ b/libfec/sumsq.c
@@ -0,0 +1,50 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include <stdlib.h> /* NOTE(review): header name restored by hand - confirm against upstream libfec */
+#include "fec.h"
+
+unsigned long long sumsq_port(signed short *,int);
+
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *,int);
+unsigned long long sumsq_sse(signed short *,int);
+unsigned long long sumsq_sse2(signed short *,int);
+#endif
+
+#ifdef __x86_64__
+unsigned long long sumsq_sse2(signed short *,int);
+#endif
+
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *,int);
+#endif
+/* Dispatch to the sum-of-squares routine selected by Cpu_mode; PORT and any unlisted mode use sumsq_port */
+unsigned long long sumsq(signed short *in,int cnt){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return sumsq_port(in,cnt);
+#ifdef __i386__
+  case SSE:   /* SSE shares the MMX routine; sumsq_sse is declared above but not called */
+  case MMX:
+    return sumsq_mmx(in,cnt);
+  case SSE2:
+    return sumsq_sse2(in,cnt);
+#endif
+
+#ifdef __x86_64__
+  case SSE2:
+    return sumsq_port(in,cnt);
+    /* sumsq_sse2 is declared for x86-64 above but is not called here */
+#endif
+
+#ifdef __VEC__
+  case ALTIVEC:
+    return sumsq_av(in,cnt);
+#endif
+  }
+}
diff
--git a/libfec/sumsq_av.c b/libfec/sumsq_av.c new file mode 100644 index 0000000..53c6acf --- /dev/null +++ b/libfec/sumsq_av.c @@ -0,0 +1,78 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * This is the Altivec SIMD version. It's a little hairy because Altivec + * does not do 64-bit operations directly, so we have to accumulate separate + * 32-bit sums and carries + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +unsigned long long sumsq_av(signed short *in,int cnt){ + long long sum; + vector signed short x; + vector unsigned int sums,carries,s1,s2; + int pad; + union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s; + + carries = sums = (vector unsigned int)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + /* A single vec_msum cannot overflow, but we have to sum it with + * the earlier terms separately to handle the carries + * The cast to unsigned is OK because squares are always positive + */ + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + } + /* Combine 4 sub-sums and carries */ + s.c[15] = 64; /* 
Shift right two 32-bit words */ + s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + s.c[15] = 32; /* Shift right one 32-bit word */ + s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + /* Extract sum and carries from right-hand words and combine into result */ + s.iv = sums; + sum = s.w[3]; + + s.iv = carries; + sum += (long long)s.w[3] << 32; + + return sum; +} + diff --git a/libfec/sumsq_mmx.c b/libfec/sumsq_mmx.c new file mode 100644 index 0000000..e766831 --- /dev/null +++ b/libfec/sumsq_mmx.c @@ -0,0 +1,35 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * MMX-assisted version (also used on SSE) + + * The SSE2 and MMX assist routines both operate on multiples of + * 8 words; they differ only in their alignment requirements (8 bytes + * for MMX, 16 bytes for SSE2) + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser Public License (LGPL) + */ + +long long sumsq_mmx_assist(signed short *,int); + +long long sumsq_mmx(signed short *in,int cnt){ + long long sum = 0; + + /* Handle stuff before the next 8-byte boundary */ + while(((int)in & 7) != 0 && cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + sum += sumsq_mmx_assist(in,cnt); + in += cnt & ~7; + cnt &= 7; + + /* Handle up to 7 words at end */ + while(cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + return sum; +} diff --git a/libfec/sumsq_mmx_assist.s b/libfec/sumsq_mmx_assist.s new file mode 100644 index 0000000..b3bac66 --- /dev/null +++ b/libfec/sumsq_mmx_assist.s @@ -0,0 +1,83 @@ +# MMX assist routines for sumsq +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Evaluate sum of squares of signed 16-bit input samples +# long long 
sumsq_mmx_assist(signed short *in,int cnt); + .global sumsq_mmx_assist + .type sumsq_mmx_assist,@function + .align 16 +sumsq_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + xor %eax,%eax + xor %edx,%edx + + # Since 4 * 32767**2 < 2**32, we can accumulate two at a time +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2) + movq 8(%esi),%mm6 # S4 S5 S6 S7 + pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2) + paddd %mm6,%mm0 # (S0^2+S1^2+S4^2+S5^2)(S2^2+S3^2+S6^2+S7^2) + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + psrlq $32,%mm0 + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + addl $16,%esi + jmp 1b + +2: emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + +# Evaluate sum of squares of signed 16-bit input samples +# long sumsq_wd_mmx_assist(signed short *in,int cnt); +# Quick version, only safe for small numbers of small input values... + .global sumsq_wd_mmx_assist + .type sumsq_wd_mmx_assist,@function + .align 16 +sumsq_wd_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + pxor %mm2,%mm2 # zero sum + +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) + movq 8(%esi),%mm1 + pmaddwd %mm1,%mm1 + paddd %mm1,%mm2 + paddd %mm0,%mm2 # accumulate + + addl $16,%esi + jmp 1b + +2: movd %mm2,%eax # even sum + psrlq $32,%mm2 + movd %mm2,%edx # odd sum + addl %edx,%eax + emms + popl %esi + popl %ebp + ret diff --git a/libfec/sumsq_port.c b/libfec/sumsq_port.c new file mode 100644 index 0000000..6d0b4c1 --- /dev/null +++ b/libfec/sumsq_port.c @@ -0,0 +1,16 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Portable C version + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +unsigned long long sumsq_port(signed short *in,int cnt){ + long long sum = 
0; + int i; + + for(i=0;i +#include +#include +#include +#include "config.h" +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +int Verbose = 0; + +int main(int argc,char *argv[]){ + signed short *buf; + int i,d,trial,trials=10000; + int bufsize = 2048; + long long port_sum,simd_sum; + time_t t; + int timetrials=0; + + find_cpu_mode(); + time(&t); + srandom(t); + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + bufsize = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'v': + Verbose++; + break; + case 'T': + timetrials++; + break; + } + } + + buf = (signed short *)calloc(bufsize,sizeof(signed short)); + if(timetrials){ + for(trial=0;trial +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi27_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi27_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi27_mmx(len); + case SSE: + return create_viterbi27_sse(len); + case SSE2: + return create_viterbi27_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi27_port(len); +#endif + } +} + +void set_viterbi27_polynomial(int polys[2]){ + switch(Cpu_mode){ + case PORT: + default: + 
set_viterbi27_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi27_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi27_polynomial_mmx(polys); + break; + case SSE: + set_viterbi27_polynomial_sse(polys); + break; + case SSE2: + set_viterbi27_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi27_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi27_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi27_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi27_mmx(p,starting_state); + case SSE: + return init_viterbi27_sse(p,starting_state); + case SSE2: + return init_viterbi27_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi27_port(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi27( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi27_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi27_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi27_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi27_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi27_sse2(p,data,nbits,endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi27_port(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi27_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi27_av(p); + 
break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi27_mmx(p); + break; + case SSE: + delete_viterbi27_sse(p); + break; + case SSE2: + delete_viterbi27_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi27_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ +int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){ + if(p == NULL) + return -1; + + switch(Cpu_mode){ + case PORT: + default: + update_viterbi27_blk_port(p,syms,nbits); + break; +#ifdef __VEC__ + case ALTIVEC: + update_viterbi27_blk_av(p,syms,nbits); + break; +#endif +#ifdef __i386__ + case MMX: + update_viterbi27_blk_mmx(p,syms,nbits); + break; + case SSE: + update_viterbi27_blk_sse(p,syms,nbits); + break; + case SSE2: + update_viterbi27_blk_sse2(p,syms,nbits); + break; +#endif +#ifdef __x86_64__ + case SSE2: + update_viterbi27_blk_port(p,syms,nbits); + break; +#endif + } + return 0; +} diff --git a/libfec/viterbi27_av.c b/libfec/viterbi27_av.c new file mode 100644 index 0000000..98d7344 --- /dev/null +++ b/libfec/viterbi27_av.c @@ -0,0 +1,210 @@ +/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t; +typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t; + +static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_av(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<4;i++) + vp->metrics1.v[i] = (vector unsigned char)(63); + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_av(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_av(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA,V27POLYB }; + set_viterbi27_polynomial_av(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_av(vp,0); + return vp; +} + +/* Viterbi chainback; returns -1 if p is NULL, 0 otherwise */ +int chainback_viterbi27_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; /* read vp only after the NULL check above */ + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_av(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* Process received symbols */ +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + vector unsigned char survivor0,survivor1,sym0v,sym1v; + vector bool char decision0,decision1; + vector unsigned char metric,m_metric,m0,m1,m2,m3; + void *tmp; + + /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */ + sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); + + sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */ 
+ sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */ + syms += 2; + + /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */ + + /* Form first set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Form first set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[0],metric); + m3 = vec_adds(vp->old_metrics->v[2],metric); + m1 = vec_adds(vp->old_metrics->v[2],m_metric); + m2 = vec_adds(vp->old_metrics->v[0],m_metric); + + /* Form second set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Compute second set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[1],metric); + m3 = vec_adds(vp->old_metrics->v[3],metric); + m1 = vec_adds(vp->old_metrics->v[3],m_metric); + m2 = vec_adds(vp->old_metrics->v[1],m_metric); + + /* Interleave and store first decisions and survivors */ + d->v[0] = vec_mergeh(decision0,decision1); + d->v[1] = vec_mergel(decision0,decision1); + vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1); + + /* Compare and select second set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Interleave and store second set of decisions and survivors */ + d->v[2] = vec_mergeh(decision0,decision1); + d->v[3] = vec_mergel(decision0,decision1); + vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[3] = 
vec_mergel(survivor0,survivor1); + + /* renormalize if necessary */ + if(vp->new_metrics->c[0] >= 105){ + vector unsigned char scale0,scale1; + + /* Find smallest metric and splat */ + scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]); + scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]); + scale0 = vec_min(scale0,scale1); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); + + /* Now subtract from all metrics */ + vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0); + vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0); + vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0); + vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + + return 0; +} + diff --git a/libfec/viterbi27_mmx.c b/libfec/viterbi27_mmx.c new file mode 100644 index 0000000..a6d5125 --- /dev/null +++ b/libfec/viterbi27_mmx.c @@ -0,0 +1,115 @@ +/* K=7 r=1/2 Viterbi decoder for MMX + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { char c[64]; __m64 v[8];} decision_t; +typedef union { unsigned char c[64]; __m64 v[8];} metric_t; + +unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); +unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in mmxbfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_mmx(void *p,int starting_state){ + struct v27 *vp = (struct v27 *)p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + int symbol; + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab27_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab27_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_mmx(int len){ + struct v27 *vp; + int polys[2] = { V27POLYA, V27POLYB }; + + if(Init == 0){ + set_viterbi27_polynomial_mmx(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_mmx(vp,0); + return vp; +} + +/* Viterbi chainback; returns -1 if p is NULL, 0 otherwise */ +int chainback_viterbi27_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v27 *vp = (struct v27 *)p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate = (endstate & 63) << 2; /* keep the 6-bit state in bits 7..2, as c[endstate>>2] below expects */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_mmx(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi27_port.c b/libfec/viterbi27_port.c new file mode 100644 index 0000000..7cac2b3 --- /dev/null +++ b/libfec/viterbi27_port.c @@ -0,0 +1,191 @@ +/* K=7 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + + +typedef union { unsigned int w[64]; } metric_t; +typedef union { unsigned long w[2];} decision_t; +static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_port(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_port(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_port(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_port(polys); + } + if((vp = malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_port(vp,0); + + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate>>2)/32] >> ((endstate>>2)%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_port(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab27[0].c[i] ^ sym0) + (Branchtab27[1].c[i] ^ sym1);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+32] + (510 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-510);\ + m1 += (metric+metric-510);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + void *tmp; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + unsigned char sym0,sym1; + + d->w[0] = d->w[1] = 0; + sym0 = *syms++; + sym1 = *syms++; + + BFLY(0); + BFLY(1); + BFLY(2); + BFLY(3); + BFLY(4); + BFLY(5); + BFLY(6); + BFLY(7); + BFLY(8); + BFLY(9); + BFLY(10); + BFLY(11); + BFLY(12); + BFLY(13); + BFLY(14); + BFLY(15); + BFLY(16); + BFLY(17); + BFLY(18); + BFLY(19); + BFLY(20); + BFLY(21); + BFLY(22); + BFLY(23); + BFLY(24); + BFLY(25); + BFLY(26); + BFLY(27); + BFLY(28); + BFLY(29); + BFLY(30); + BFLY(31); + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi27_sse.c b/libfec/viterbi27_sse.c new file mode 100644 index 0000000..cd1f287 --- /dev/null +++ b/libfec/viterbi27_sse.c @@ -0,0 +1,113 @@ +/* K=7 r=1/2 Viterbi decoder for SSE + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[64]; } metric_t; +typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; +union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in ssebfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder; returns NULL if allocation fails */ +void *create_viterbi27_sse(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + + set_viterbi27_polynomial_sse(polys); + } + if((vp = malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_sse(vp,0); /* SSE initializer directly; generic init_viterbi27() dispatches on Cpu_mode */ + return vp; +} + +void set_viterbi27_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi27_sse2.c b/libfec/viterbi27_sse2.c new file mode 100644 index 0000000..bc01710 --- /dev/null +++ b/libfec/viterbi27_sse2.c @@ -0,0 +1,180 @@ +/* K=7 r=1/2 Viterbi decoder for SSE2 + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; +typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; +union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse2(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_sse2(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_sse2(int len){ + void *p; + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + set_viterbi27_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v27))) + return NULL; + vp = (struct v27 *)p; + + if((p = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi27_sse2(vp,0); + + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse2(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +#if 0 +/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. 
*/ +void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return; + d = (decision_t *)vp->dp; + while(nbits--){ + __m128i sym0v,sym1v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi8(syms[0]); + sym1v = _mm_set1_epi8(syms[1]); + syms += 2; + + for(i=0;i<2;i++){ + __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics */ + metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v)); + /* There's no packed bytes right shift in SSE2, so we use the word version and mask + * (I'm *really* starting to like Altivec...) + */ + metric = _mm_srli_epi16(metric,3); + metric = _mm_and_si128(metric,_mm_set1_epi8(31)); + m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_epi8(vp->old_metrics->v[i],metric); + m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric); + m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric); + m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric); + + /* Compare and select, using modulo arithmetic */ + decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128()); + decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128()); + survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0)); + survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2)); + + /* Pack each set of decisions into 16 bits */ + d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1)); + d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1)); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = 
vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; +} +#endif diff --git a/libfec/viterbi29.c b/libfec/viterbi29.c new file mode 100644 index 0000000..f51e356 --- /dev/null +++ b/libfec/viterbi29.c @@ -0,0 +1,178 @@ +/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi29_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi29_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi29_mmx(len); + case SSE: + return create_viterbi29_sse(len); + case SSE2: + return create_viterbi29_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi29_port(len); +#endif + } +} + +void set_viterbi29_polynomial(int polys[2]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi29_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi29_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi29_polynomial_mmx(polys); + break; + case SSE: + set_viterbi29_polynomial_sse(polys); + break; + case SSE2: + set_viterbi29_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi29_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi29_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi29_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi29_mmx(p,starting_state); + case SSE: + return init_viterbi29_sse(p,starting_state); + case SSE2: + return init_viterbi29_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case 
/* Viterbi chainback for the K=9 r=1/2 decoder.
 *
 * Forwards to the chainback routine of the implementation selected by
 * Cpu_mode and returns whatever that routine returns.
 *
 * p        - decoder instance
 * data     - decoded output data
 * nbits    - number of data bits
 * endstate - terminal encoder state
 */
int chainback_viterbi29(
      void *p,
      unsigned char *data,
      unsigned int nbits,
      unsigned int endstate){
#ifdef __VEC__
  if(Cpu_mode == ALTIVEC)
    return chainback_viterbi29_av(p,data,nbits,endstate);
#endif
#ifdef __i386__
  if(Cpu_mode == MMX)
    return chainback_viterbi29_mmx(p,data,nbits,endstate);
  if(Cpu_mode == SSE)
    return chainback_viterbi29_sse(p,data,nbits,endstate);
  if(Cpu_mode == SSE2)
    return chainback_viterbi29_sse2(p,data,nbits,endstate);
#endif
  /* PORT, any unrecognised mode, and SSE2 on x86_64 all use portable C */
  return chainback_viterbi29_port(p,data,nbits,endstate);
}
/* Feed a block of demodulated symbols to the K=9 r=1/2 decoder.
 *
 * nbits is the number of decoded data bits, not the number of symbols.
 * The call is forwarded to the implementation selected by Cpu_mode and its
 * return value is passed back unchanged.
 */
int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){
#ifdef __VEC__
  if(Cpu_mode == ALTIVEC)
    return update_viterbi29_blk_av(p,syms,nbits);
#endif
#ifdef __i386__
  if(Cpu_mode == MMX)
    return update_viterbi29_blk_mmx(p,syms,nbits);
  if(Cpu_mode == SSE)
    return update_viterbi29_blk_sse(p,syms,nbits);
  if(Cpu_mode == SSE2)
    return update_viterbi29_blk_sse2(p,syms,nbits);
#endif
  /* PORT, any unrecognised mode, and SSE2 on x86_64 all use portable C */
  return update_viterbi29_blk_port(p,syms,nbits);
}
0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_av(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_av(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = { V29POLYA,V29POLYB }; + set_viterbi29_polynomial_av(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29_av(vp,0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi29_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_av(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){ + struct v29 *vp = p; + decision_t *d; + int i; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + + while(nbits--){ + vector unsigned char sym1v,sym2v; + void *tmp; + + /* All this seems necessary just to load a byte into all elements of a vector! */ + sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */ + sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */ + sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */ + syms += 2; + + for(i=0;i<8;i++){ + vector bool char decision0,decision1; + vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics */ + metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = (vector unsigned char)(31) - metric; + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[8+i],metric); + m1 = vec_adds(vp->old_metrics->v[8+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Interleave and store decisions and survivors */ + d->v[2*i] = vec_mergeh(decision0,decision1); + d->v[2*i+1] = vec_mergel(decision0,decision1); + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); 
+ vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + } + d++; + /* renormalize if necessary */ + if(vp->new_metrics->c[0] >= 50){ + int i; + vector unsigned char scale0,scale1; + + /* Find smallest metric and splat */ + scale0 = vp->new_metrics->v[0]; + scale1 = vp->new_metrics->v[1]; + for(i=2;i<16;i+=2){ + scale0 = vec_min(scale0,vp->new_metrics->v[i]); + scale1 = vec_min(scale1,vp->new_metrics->v[i+1]); + } + scale0 = vec_min(scale0,scale1); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); + + /* Now subtract from all metrics */ + for(i=0;i<16;i++) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0); + } + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi29_mmx.c b/libfec/viterbi29_mmx.c new file mode 100644 index 0000000..563f40a --- /dev/null +++ b/libfec/viterbi29_mmx.c @@ -0,0 +1,118 @@ +/* K=9 r=1/2 Viterbi decoder for MMX + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { char c[256]; __m64 v[32];} decision_t; +typedef union { unsigned char c[256]; __m64 v[32];} metric_t; + +unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8))); +unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in mmxbfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_mmx(int len){ + struct v29 *vp; + + if(Init == 0){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial_mmx(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + int symbol; + + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab29_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab29_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_mmx(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v29 *vp = (struct v29 *)p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + endstate &= 255; + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_mmx(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi29_port.c b/libfec/viterbi29_port.c new file mode 100644 index 0000000..292dce8 --- /dev/null +++ b/libfec/viterbi29_port.c @@ -0,0 +1,166 @@ +/* K=9 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab29[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to 
path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_port(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_port(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_port(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + set_viterbi29_polynomial_port(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_port(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (510 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-510);\ + m1 += (metric+metric-510);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ + +int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + while(nbits--){ + void *tmp; + unsigned char sym0,sym1; + int i; + + for(i=0;i<8;i++) + d->w[i] = 0; + sym0 = *syms++; + sym1 = *syms++; + + for(i=0;i<128;i++) + BFLY(i); + + d++; + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi29_sse.c b/libfec/viterbi29_sse.c new file mode 100644 index 0000000..4a92e5f --- /dev/null +++ b/libfec/viterbi29_sse.c @@ -0,0 +1,114 @@ +/* K=9 r=1/2 Viterbi decoder for SSE + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char w[256]; __m64 v[32];} metric_t; +typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t; + +union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_sse(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = { V29POLYA,V29POLYB }; + + set_viterbi29_polynomial_sse(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_sse(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 200; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi29_sse2.c b/libfec/viterbi29_sse2.c new file mode 100644 index 0000000..4c7336c --- /dev/null +++ b/libfec/viterbi29_sse2.c @@ -0,0 +1,119 @@ +/* K=9 r=1/2 Viterbi decoder for SSE2 + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[256]; __m128i v[16];} metric_t; +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; + +union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_sse2(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_sse2(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_sse2(int len){ + void *p; + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29))) + return NULL; + vp = (struct v29 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi29_sse2(vp,0); + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ 
+ endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse2(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi39.c b/libfec/viterbi39.c new file mode 100644 index 0000000..d2e65f4 --- /dev/null +++ b/libfec/viterbi39.c @@ -0,0 +1,179 @@ +/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Aug 2006, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi39_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi39_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi39_mmx(len); + case SSE: + return create_viterbi39_sse(len); + case SSE2: + return create_viterbi39_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi39_port(len); +#endif + } +} + +void set_viterbi39_polynomial(int polys[3]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi39_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi39_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi39_polynomial_mmx(polys); + break; + case SSE: + set_viterbi39_polynomial_sse(polys); + break; + case SSE2: + set_viterbi39_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi39_polynomial_port(polys); + break; +#endif + } +} + + +/* Initialize Viterbi decoder for start of new frame */ +int 
/* Delete an instance of the K=9 r=1/3 Viterbi decoder.
 *
 * Hands the instance to the delete routine of the implementation selected
 * by Cpu_mode.
 */
void delete_viterbi39(void *p){
#ifdef __VEC__
  if(Cpu_mode == ALTIVEC){
    delete_viterbi39_av(p);
    return;
  }
#endif
#ifdef __i386__
  if(Cpu_mode == MMX){
    delete_viterbi39_mmx(p);
    return;
  }
  if(Cpu_mode == SSE){
    delete_viterbi39_sse(p);
    return;
  }
  if(Cpu_mode == SSE2){
    delete_viterbi39_sse2(p);
    return;
  }
#endif
  /* PORT, any unrecognised mode, and SSE2 on x86_64 all use portable C */
  delete_viterbi39_port(p);
}
/* Feed a block of demodulated symbols to the K=9 r=1/3 decoder.
 *
 * nbits is the number of decoded data bits, not the number of symbols.
 * The call is forwarded to the implementation selected by Cpu_mode and its
 * return value is passed back unchanged.
 */
int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){
#ifdef __VEC__
  if(Cpu_mode == ALTIVEC)
    return update_viterbi39_blk_av(p,syms,nbits);
#endif
#ifdef __i386__
  if(Cpu_mode == MMX)
    return update_viterbi39_blk_mmx(p,syms,nbits);
  if(Cpu_mode == SSE)
    return update_viterbi39_blk_sse(p,syms,nbits);
  if(Cpu_mode == SSE2)
    return update_viterbi39_blk_sse2(p,syms,nbits);
#endif
  /* PORT, any unrecognised mode, and SSE2 on x86_64 all use portable C */
  return update_viterbi39_blk_port(p,syms,nbits);
}
vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_av(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_av(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_av(polys); + } + vp = (struct v39 *)malloc(sizeof(struct v39)); + vp->decisions = malloc(sizeof(decision_t)*(len+8)); + init_viterbi39_av(vp,0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi39_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_av(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + syms += 3; + + for(i=0;i<16;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * the metrics are in the range 0-765 + */ + m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v)); + m1 = vec_xor(Branchtab39[2].v[i],sym2v); + metric = vec_add(m0,m1); + m_metric = vec_sub((vector unsigned short)(765),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[16+i],metric); + m1 = vec_adds(vp->old_metrics->v[16+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<32;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825 + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-5000){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=31;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/libfec/viterbi39_mmx.c b/libfec/viterbi39_mmx.c new file mode 100644 index 0000000..875391a --- /dev/null +++ b/libfec/viterbi39_mmx.c @@ -0,0 +1,185 @@ +/* K=9 r=1/3 Viterbi decoder for x86 MMX + * Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[256]; __m64 v[32];} decision_t; +typedef union { unsigned short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* 
path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_mmx(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_mmx(int polys[3]){ /* a negative polys[n] flips the table entry; its magnitude is the tap mask */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_mmx(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA,V39POLYB,V39POLYC }; + set_viterbi39_polynomial_mmx(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_mmx(vp,0); + return vp; +} + + + +/* Viterbi chainback */ +int chainback_viterbi39_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_mmx(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + if(vp->new_metrics->s[0] < vp->old_metrics->s[0]) + path_metric += 65536; /* Hack: wraparound probably occurred */ + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_port.c b/libfec/viterbi39_port.c new file mode 100644 index 0000000..5685c90 --- /dev/null +++ b/libfec/viterbi39_port.c @@ -0,0 +1,168 @@ +/* K=9 r=1/3 Viterbi decoder in portable C + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public 
License (LGPL) + */ +#include +#include +#include +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_port(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_port(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].c[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_port(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = {V39POLYA,V39POLYB,V39POLYC}; + set_viterbi39_polynomial_port(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi39_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_port(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab39[0].c[i] ^ sym0) + (Branchtab39[1].c[i] ^ sym1) + \ + (Branchtab39[2].c[i] ^ sym2);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (765 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? 
m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-765);\ + m1 += (metric+metric-765);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + while(nbits--){ + void *tmp; + unsigned char sym0,sym1,sym2; + int i; + + for(i=0;i<8;i++) + d->w[i] = 0; + sym0 = *syms++; + sym1 = *syms++; + sym2 = *syms++; + + for(i=0;i<128;i++) + BFLY(i); + + d++; + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi39_sse.c b/libfec/viterbi39_sse.c new file mode 100644 index 0000000..c2f2865 --- /dev/null +++ b/libfec/viterbi39_sse.c @@ -0,0 +1,201 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; +typedef union { signed short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse(void *p,int 
starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_sse(vp,0); + return vp; +} + +void set_viterbi39_polynomial_sse(int polys[3]){ /* a negative polys[n] flips the table entry; its magnitude is the tap mask */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric - SHRT_MIN; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<64;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<64;i++) + vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp 
= d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_sse2.c b/libfec/viterbi39_sse2.c new file mode 100644 index 0000000..f13794e --- /dev/null +++ b/libfec/viterbi39_sse2.c @@ -0,0 +1,200 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; +typedef union { signed short s[256]; __m128i v[32];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse2(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse2(int len){ + void *p; + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39))) + return NULL; + + vp = (struct v39 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + 
init_viterbi39_sse2(vp,0); + return vp; +} + +void set_viterbi39_polynomial_sse2(int polys[3]){ /* a negative polys[n] flips the table entry; its magnitude is the tap mask */ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse2(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + + while(nbits--){ + __m128i sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi16(syms[0]); + sym1v = _mm_set1_epi16(syms[1]); + sym2v = _mm_set1_epi16(syms[2]); + syms += 3; + + /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */ + for(i=0;i<16;i++){ + __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * 
Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. + * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric); + m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric); + m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_epi16(m0,m1); + survivor1 = _mm_min_epi16(m2,m3); + decision0 = _mm_cmpeq_epi16(survivor0,m1); + decision1 = _mm_cmpeq_epi16(survivor1,m3); + + /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ + d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1); + } + /* See if we need to renormalize */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m128i adjustv; + union { __m128i v; signed short w[8]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_epi16(adjust); + + /* 
We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for(i=0;i<32;i++) + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} + + diff --git a/libfec/viterbi615.c b/libfec/viterbi615.c new file mode 100644 index 0000000..ec2fb3c --- /dev/null +++ b/libfec/viterbi615.c @@ -0,0 +1,181 @@ +/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615(int len){ + + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi615_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi615_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi615_mmx(len); + case SSE: + return create_viterbi615_sse(len); + case SSE2: + return create_viterbi615_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi615_port(len); +#endif + } +} + +void set_viterbi615_polynomial(int polys[6]){ + + switch(Cpu_mode){ + case PORT: + default: + set_viterbi615_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi615_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi615_polynomial_mmx(polys); + break; + case SSE: + set_viterbi615_polynomial_sse(polys); + break; + case SSE2: + set_viterbi615_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi615_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return 
init_viterbi615_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi615_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi615_mmx(p,starting_state); + case SSE: + return init_viterbi615_sse(p,starting_state); + case SSE2: + return init_viterbi615_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi615_port(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi615( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi615_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi615_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi615_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi615_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi615_sse2(p,data,nbits,endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi615_port(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi615_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi615_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi615_mmx(p); + break; + case SSE: + delete_viterbi615_sse(p); + break; + case SSE2: + delete_viterbi615_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi615_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi615_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi615_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi615_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi615_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi615_blk_sse2(p,syms,nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi615_blk_port(p,syms,nbits); +#endif + } +} + diff --git a/libfec/viterbi615_av.c b/libfec/viterbi615_av.c new file mode 100644 index 0000000..4a6ce9c --- /dev/null +++ b/libfec/viterbi615_av.c @@ -0,0 +1,257 @@ +/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t; +typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t; + +static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_av(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + + for(i=0;i<2048;i++) + vp->metrics1.v[i] = (vector unsigned short)(5000); + + vp->old_metrics = &vp->metrics1; + 
vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder; allocates len+14 decision records and returns NULL if either allocation fails */ +void *create_viterbi615_av(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_av(polys); + } + vp = (struct v615 *)malloc(sizeof(struct v615)); if(vp == NULL) return NULL; /* checked like the mmx/sse/port variants */ + vp->decisions = malloc(sizeof(decision_t)*(len+14)); if(vp->decisions == NULL){ free(vp); return NULL; } /* do not leak vp on failure */ + init_viterbi615_av(vp,0); + return vp; +} + +void set_viterbi615_polynomial_av(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + + +/* Viterbi chainback */ +int chainback_viterbi615_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 16384; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_av(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + sym3v = vec_splat(symv,3); + sym4v = vec_splat(symv,4); + sym5v = vec_splat(symv,5); + syms += 6; + + for(i=0;i<1024;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v)); + m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v)); + m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v)); + metric = vec_add(m0,m1); + metric = vec_add(metric,m2); + m_metric = vec_sub((vector unsigned short)(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[1024+i],metric); + m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<2048;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + + * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750. + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-12750){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<2048;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=2047;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/libfec/viterbi615_mmx.c b/libfec/viterbi615_mmx.c new file mode 100644 index 0000000..89a56f7 --- /dev/null +++ b/libfec/viterbi615_mmx.c @@ -0,0 +1,183 @@ +/* K=15 r=1/6 Viterbi decoder for x86 MMX + * Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t; +typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + 
metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_mmx(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = 5000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_mmx(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_mmx(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_mmx(vp,0); + return vp; +} + +void set_viterbi615_polynomial_mmx(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_mmx(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/libfec/viterbi615_port.c b/libfec/viterbi615_port.c new file mode 100644 index 0000000..89bdd80 --- /dev/null +++ b/libfec/viterbi615_port.c @@ -0,0 +1,156 @@ +/* K=15 r=1/6 Viterbi decoder in portable C + * Copyright Mar 2004, Phil Karn, 
KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { unsigned long w[16384]; } metric_t; + +static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder; allocates len+14 decision records and returns NULL if either allocation fails */ +void *create_viterbi615_port(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_port(polys); + } + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_port(vp,0); /* port init directly: the init_viterbi615() dispatcher may pick another backend whose struct v615 layout differs */ + return vp; +} + +void set_viterbi615_polynomial_port(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 
255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_port(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.w[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi615_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_port(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned long metric,m0,m1,m2,m3,decision0,decision1;\ + metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\ + +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\ + +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\ + m2 = vp->old_metrics->w[i] + (1530-metric);\ + m3 = vp->old_metrics->w[i+8192] + metric;\ + decision0 = (signed long)(m0-m1) >= 0;\ + decision1 = (signed long)(m2-m3) >= 0;\ + vp->new_metrics->w[2*i] = decision0 ? 
m1 : m0;\ + vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\ + d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\ +} +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + void *tmp; + decision_t *d; + int i; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + memset(d,0,sizeof(decision_t)); + for(i=0;i<8192;i++) + BFLY(i); + + syms += 6; + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} + diff --git a/libfec/viterbi615_sse.c b/libfec/viterbi615_sse.c new file mode 100644 index 0000000..de0f8af --- /dev/null +++ b/libfec/viterbi615_sse.c @@ -0,0 +1,201 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { signed short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = (SHRT_MIN+5000); + 
+ vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_sse(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_sse(vp,0); + return vp; +} + +void set_viterbi615_polynomial_sse(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<4096;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<4096;i++) + vp->new_metrics->v[i] = 
_mm_sub_pi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +}
diff --git a/libfec/viterbi615_sse2.c b/libfec/viterbi615_sse2.c
new file mode 100644
index 0000000..7f711e5
--- /dev/null
+++ b/libfec/viterbi615_sse2.c
@@ -0,0 +1,204 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include
+#include
+#include
+#include
+#include
+#include "fec.h"
+
+typedef union { unsigned int w[512]; unsigned short s[1024];} decision_t; /* One decision bit per state: written 16 at a time via s[], read back in chainback as bit (state%32) of w[state/32], so w[] must be 32-bit words (unsigned long is 64 bits on LP64 and broke that indexing) */
+typedef union { signed short s[16384]; __m128i v[2048];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame: reset the metrics in buffer 1, rewind dp to the start of the decision buffer. Returns -1 if p is NULL, else 0 */
+int init_viterbi615_sse2(void *p,int starting_state){
+  struct v615 *vp = p;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16384;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+5000);
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+
+/* Create a new instance of a Viterbi decoder with room for len+14 decisions (data bits plus tail); returns NULL if either allocation fails */
+void *create_viterbi615_sse2(int len){
+  void *p;
+  struct v615 *vp;
+
+  if(!Init){ /* First use: build the branch tables from the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse2(polys);
+  }
+
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v615)))
+    return NULL;
+
+  vp = (struct v615 *)p;
+  if((p = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp); /* Do not leak the decoder state when the decision buffer cannot be allocated */
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi615_sse2(vp,0);
+  return vp;
+}
+
+void set_viterbi615_polynomial_sse2(int polys[6]){ /* Fill Branchtab615 with 0 or 255 per state for each of the six polynomials; a negative polynomial inverts that symbol */
+  int state;
+  int i;
+
+  for(state=0;state < 8192;state++){
+    for(i=0;i<6;i++)
+      Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Viterbi chainback: walk the stored decisions backwards from endstate and write nbits decoded bits into data[]. Always returns 0 */
+int chainback_viterbi615_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d = (decision_t *)vp->decisions;
+
+  endstate %= 16384;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; /* Decision bit for endstate; relies on w[] being 32-bit words */
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Delete instance of a Viterbi decoder: frees the decision buffer and the state block; NULL is ignored */
+void delete_viterbi615_sse2(void *p){
+  struct v615 *vp = p;
+
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+/* Decode nbits bits: syms supplies 6 symbols per bit, one decision_t is stored per bit starting at vp->dp, and vp->dp is advanced. Always returns 0 */
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d = (decision_t *)vp->dp;
+
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    sym3v = _mm_set1_epi16(syms[3]);
+    sym4v = _mm_set1_epi16(syms[4]);
+    sym5v = _mm_set1_epi16(syms[5]);
+    syms += 6;
+
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-90 branch metrics is 405
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+        adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_epi16(adjust);
+
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<2048;i++)
+        vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
+
+
diff --git a/libfec/vtest27.c b/libfec/vtest27.c new file mode 100644 index 0000000..7256483 --- /dev/null +++ b/libfec/vtest27.c @@ -0,0 +1,184 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./2.)
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long int tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*2*(MAXBYTES+6)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi27(framebits)) == NULL){ + printf("create_viterbi27 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi27(vp,0); + + /* Decode block */ + update_viterbi27_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi27(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} diff --git a/libfec/vtest29.c b/libfec/vtest29.c new file mode 100644 index 0000000..8471b54 --- /dev/null +++ b/libfec/vtest29.c @@ -0,0 +1,185 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./2.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*2*(MAXBYTES+8)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi29(framebits)) == NULL){ + printf("create_viterbi29 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi29(vp,0); + + /* Decode block */ + update_viterbi29_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi29(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} + + diff --git a/libfec/vtest39.c b/libfec/vtest39.c new file mode 100644 index 0000000..76723b2 --- /dev/null +++ b/libfec/vtest39.c @@ -0,0 +1,186 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./3.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*3*(MAXBYTES+8)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi39(framebits)) == NULL){ + printf("create_viterbi39 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi39(vp,0); + + /* Decode block */ + update_viterbi39_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi39(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} + + diff --git a/libfec/vtest615.c b/libfec/vtest615.c new file mode 100644 index 0000000..4bd8c4f --- /dev/null +++ b/libfec/vtest615.c @@ -0,0 +1,191 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./6.) 
+#define MAXBYTES 10000 +#define OFFSET (127.5) +#define CLIP 255 + +double Gain = 24.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10,errcnt,framebits=2048; + int tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*6*(MAXBYTES+14)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi615(framebits)) == NULL){ + printf("create_viterbi615 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %d/%d (%.3g) FER %d/%d (%.3g)\n", + tot_errs,framebits*(tr+1),tot_errs/((double)framebits*(tr+1)), + badframes,(tr+1),(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi615(vp,0); + + /* Decode block */ + update_viterbi615_blk(vp,symbols,framebits+14); + + /* Do Viterbi chainback */ + chainback_viterbi615(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +}