From f33f46edb988e393f00ce9fe91a877c4f59e9149 Mon Sep 17 00:00:00 2001 From: Manolis Surligas Date: Wed, 31 Jan 2018 19:18:45 +0200 Subject: [PATCH] Add libfec as external project The CMake build system will first try to locate libfec in the system. if it is not available, it will start building the libfec that is part of the gr-satnogs source code. During the installation libfec is installed in the system, thus future builds will make use of it and will not rebuild from source. --- CMakeLists.txt | 40 +- .../debug_afsk_transceiver_osmocom.py | 310 ++++ apps/flowgraphs/device_args_handler.py | 5 + apps/flowgraphs/satellites/mpla.ogg | Bin 0 -> 9044 bytes cmake/Modules/FindFec.cmake | 2 +- lib/CMakeLists.txt | 3 + libfec/CMakeLists.txt | 323 ++++ libfec/INSTALL | 51 + libfec/LICENSE | 502 ++++++ libfec/README | 125 ++ libfec/README.x86-64 | 13 + libfec/bootstrap | 6 + libfec/ccsds.h | 5 + libfec/char.h | 24 + libfec/cmake/Modules/Version.cmake | 115 ++ libfec/cmake/cmake_uninstall.cmake.in | 21 + libfec/config.guess | 1516 +++++++++++++++++ libfec/config.sub | 1362 +++++++++++++++ libfec/configure.in | 90 + libfec/cpu_features.s | 15 + libfec/cpu_mode_generic.c | 13 + libfec/cpu_mode_ppc.c | 40 + libfec/cpu_mode_x86.c | 33 + libfec/cpu_mode_x86_64.c | 27 + libfec/decode_rs.c | 262 +++ libfec/decode_rs.h | 298 ++++ libfec/decode_rs_8.c | 24 + libfec/decode_rs_ccsds.c | 26 + libfec/decode_rs_char.c | 22 + libfec/decode_rs_int.c | 22 + libfec/dotprod.c | 111 ++ libfec/dotprod.h | 15 + libfec/dotprod_av.c | 93 + libfec/dotprod_mmx.c | 81 + libfec/dotprod_mmx_assist.s | 83 + libfec/dotprod_port.c | 58 + libfec/dotprod_sse2.c | 72 + libfec/dotprod_sse2_assist.s | 85 + libfec/dsp.3 | 63 + libfec/dtest.c | 99 ++ libfec/encode_rs.c | 52 + libfec/encode_rs.h | 58 + libfec/encode_rs_8.c | 117 ++ libfec/encode_rs_av.c | 61 + libfec/encode_rs_ccsds.c | 24 + libfec/encode_rs_char.c | 15 + libfec/encode_rs_int.c | 15 + libfec/exercise.c | 122 ++ libfec/fec.c | 66 + libfec/fec.h | 
355 ++++ libfec/fixed.h | 33 + libfec/gen_ccsds.c | 39 + libfec/gen_ccsds_tal.c | 53 + libfec/init_rs.c | 39 + libfec/init_rs.h | 106 ++ libfec/init_rs_char.c | 35 + libfec/init_rs_int.c | 35 + libfec/install-sh | 251 +++ libfec/int.h | 22 + libfec/lesser.txt | 504 ++++++ libfec/libfec.pc.in | 13 + libfec/makefile.in | 249 +++ libfec/mmxbfly27.s | 148 ++ libfec/mmxbfly29.s | 161 ++ libfec/peak_mmx_assist.s | 70 + libfec/peak_sse2_assist.s | 51 + libfec/peak_sse_assist.s | 49 + libfec/peaktest.c | 38 + libfec/peakval.c | 50 + libfec/peakval_av.c | 61 + libfec/peakval_mmx.c | 34 + libfec/peakval_mmx_assist.s | 70 + libfec/peakval_port.c | 16 + libfec/peakval_sse.c | 35 + libfec/peakval_sse2.c | 34 + libfec/peakval_sse2_assist.s | 51 + libfec/peakval_sse_assist.s | 49 + libfec/rs-common.h | 26 + libfec/rs.3 | 198 +++ libfec/rs_speedtest.c | 54 + libfec/rstest.c | 296 ++++ libfec/sim.c | 43 + libfec/simd-viterbi.3 | 247 +++ libfec/sqtest.c | 42 + libfec/sse2bfly27-64.s | 210 +++ libfec/sse2bfly27.s | 202 +++ libfec/sse2bfly29-64.s | 254 +++ libfec/sse2bfly29.s | 245 +++ libfec/ssebfly27.s | 205 +++ libfec/ssebfly29.s | 271 +++ libfec/sumsq.c | 50 + libfec/sumsq_av.c | 78 + libfec/sumsq_mmx.c | 35 + libfec/sumsq_mmx_assist.s | 83 + libfec/sumsq_port.c | 16 + libfec/sumsq_sse2.c | 33 + libfec/sumsq_sse2_assist.s | 49 + libfec/sumsq_test.c | 101 ++ libfec/viterbi27.c | 188 ++ libfec/viterbi27_av.c | 210 +++ libfec/viterbi27_mmx.c | 115 ++ libfec/viterbi27_port.c | 191 +++ libfec/viterbi27_sse.c | 113 ++ libfec/viterbi27_sse2.c | 180 ++ libfec/viterbi29.c | 178 ++ libfec/viterbi29_av.c | 190 +++ libfec/viterbi29_mmx.c | 118 ++ libfec/viterbi29_port.c | 166 ++ libfec/viterbi29_sse.c | 114 ++ libfec/viterbi29_sse2.c | 119 ++ libfec/viterbi39.c | 179 ++ libfec/viterbi39_av.c | 251 +++ libfec/viterbi39_mmx.c | 185 ++ libfec/viterbi39_port.c | 168 ++ libfec/viterbi39_sse.c | 201 +++ libfec/viterbi39_sse2.c | 200 +++ libfec/viterbi615.c | 181 ++ libfec/viterbi615_av.c | 257 +++ 
libfec/viterbi615_mmx.c | 183 ++ libfec/viterbi615_port.c | 156 ++ libfec/viterbi615_sse.c | 201 +++ libfec/viterbi615_sse2.c | 204 +++ libfec/vtest27.c | 184 ++ libfec/vtest29.c | 185 ++ libfec/vtest39.c | 186 ++ libfec/vtest615.c | 191 +++ 126 files changed, 16966 insertions(+), 2 deletions(-) create mode 100755 apps/flowgraphs/debug_afsk_transceiver_osmocom.py create mode 100644 apps/flowgraphs/device_args_handler.py create mode 100644 apps/flowgraphs/satellites/mpla.ogg create mode 100644 libfec/CMakeLists.txt create mode 100644 libfec/INSTALL create mode 100644 libfec/LICENSE create mode 100644 libfec/README create mode 100644 libfec/README.x86-64 create mode 100755 libfec/bootstrap create mode 100644 libfec/ccsds.h create mode 100644 libfec/char.h create mode 100644 libfec/cmake/Modules/Version.cmake create mode 100644 libfec/cmake/cmake_uninstall.cmake.in create mode 100644 libfec/config.guess create mode 100755 libfec/config.sub create mode 100644 libfec/configure.in create mode 100644 libfec/cpu_features.s create mode 100644 libfec/cpu_mode_generic.c create mode 100644 libfec/cpu_mode_ppc.c create mode 100644 libfec/cpu_mode_x86.c create mode 100644 libfec/cpu_mode_x86_64.c create mode 100644 libfec/decode_rs.c create mode 100644 libfec/decode_rs.h create mode 100644 libfec/decode_rs_8.c create mode 100644 libfec/decode_rs_ccsds.c create mode 100644 libfec/decode_rs_char.c create mode 100644 libfec/decode_rs_int.c create mode 100644 libfec/dotprod.c create mode 100644 libfec/dotprod.h create mode 100644 libfec/dotprod_av.c create mode 100644 libfec/dotprod_mmx.c create mode 100644 libfec/dotprod_mmx_assist.s create mode 100644 libfec/dotprod_port.c create mode 100644 libfec/dotprod_sse2.c create mode 100644 libfec/dotprod_sse2_assist.s create mode 100644 libfec/dsp.3 create mode 100644 libfec/dtest.c create mode 100644 libfec/encode_rs.c create mode 100644 libfec/encode_rs.h create mode 100644 libfec/encode_rs_8.c create mode 100644 libfec/encode_rs_av.c 
create mode 100644 libfec/encode_rs_ccsds.c create mode 100644 libfec/encode_rs_char.c create mode 100644 libfec/encode_rs_int.c create mode 100644 libfec/exercise.c create mode 100644 libfec/fec.c create mode 100644 libfec/fec.h create mode 100644 libfec/fixed.h create mode 100644 libfec/gen_ccsds.c create mode 100644 libfec/gen_ccsds_tal.c create mode 100644 libfec/init_rs.c create mode 100644 libfec/init_rs.h create mode 100644 libfec/init_rs_char.c create mode 100644 libfec/init_rs_int.c create mode 100755 libfec/install-sh create mode 100644 libfec/int.h create mode 100644 libfec/lesser.txt create mode 100644 libfec/libfec.pc.in create mode 100644 libfec/makefile.in create mode 100644 libfec/mmxbfly27.s create mode 100644 libfec/mmxbfly29.s create mode 100644 libfec/peak_mmx_assist.s create mode 100644 libfec/peak_sse2_assist.s create mode 100644 libfec/peak_sse_assist.s create mode 100644 libfec/peaktest.c create mode 100644 libfec/peakval.c create mode 100644 libfec/peakval_av.c create mode 100644 libfec/peakval_mmx.c create mode 100644 libfec/peakval_mmx_assist.s create mode 100644 libfec/peakval_port.c create mode 100644 libfec/peakval_sse.c create mode 100644 libfec/peakval_sse2.c create mode 100644 libfec/peakval_sse2_assist.s create mode 100644 libfec/peakval_sse_assist.s create mode 100644 libfec/rs-common.h create mode 100644 libfec/rs.3 create mode 100644 libfec/rs_speedtest.c create mode 100644 libfec/rstest.c create mode 100644 libfec/sim.c create mode 100644 libfec/simd-viterbi.3 create mode 100644 libfec/sqtest.c create mode 100644 libfec/sse2bfly27-64.s create mode 100644 libfec/sse2bfly27.s create mode 100644 libfec/sse2bfly29-64.s create mode 100644 libfec/sse2bfly29.s create mode 100644 libfec/ssebfly27.s create mode 100644 libfec/ssebfly29.s create mode 100644 libfec/sumsq.c create mode 100644 libfec/sumsq_av.c create mode 100644 libfec/sumsq_mmx.c create mode 100644 libfec/sumsq_mmx_assist.s create mode 100644 libfec/sumsq_port.c create 
mode 100644 libfec/sumsq_sse2.c create mode 100644 libfec/sumsq_sse2_assist.s create mode 100644 libfec/sumsq_test.c create mode 100644 libfec/viterbi27.c create mode 100644 libfec/viterbi27_av.c create mode 100644 libfec/viterbi27_mmx.c create mode 100644 libfec/viterbi27_port.c create mode 100644 libfec/viterbi27_sse.c create mode 100644 libfec/viterbi27_sse2.c create mode 100644 libfec/viterbi29.c create mode 100644 libfec/viterbi29_av.c create mode 100644 libfec/viterbi29_mmx.c create mode 100644 libfec/viterbi29_port.c create mode 100644 libfec/viterbi29_sse.c create mode 100644 libfec/viterbi29_sse2.c create mode 100644 libfec/viterbi39.c create mode 100644 libfec/viterbi39_av.c create mode 100644 libfec/viterbi39_mmx.c create mode 100644 libfec/viterbi39_port.c create mode 100644 libfec/viterbi39_sse.c create mode 100644 libfec/viterbi39_sse2.c create mode 100644 libfec/viterbi615.c create mode 100644 libfec/viterbi615_av.c create mode 100644 libfec/viterbi615_mmx.c create mode 100644 libfec/viterbi615_port.c create mode 100644 libfec/viterbi615_sse.c create mode 100644 libfec/viterbi615_sse2.c create mode 100644 libfec/vtest27.c create mode 100644 libfec/vtest29.c create mode 100644 libfec/vtest39.c create mode 100644 libfec/vtest615.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 4be403f..34ce3db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,7 +131,6 @@ find_package(Volk REQUIRED) find_package(OggVorbis REQUIRED) find_package(PNG REQUIRED) find_package(png++ REQUIRED) -find_package(Fec REQUIRED) ######################################################################## # Include or not into the module blocks for debugging @@ -150,6 +149,45 @@ if(${INCLUDE_DEBUG_BLOCKS}) endif() endif() +######################################################################## +# Search for the libfec if it is already installed in the system +# If not, install the internal one. 
+########################################################################
+find_package(Fec)
+if(NOT FEC_FOUND)
+    message(WARNING "libfec is not installed. The internal libfec will be automatically build and install.")
+    include(ExternalProject)
+    ExternalProject_Add(FEC_EXTERNAL
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libfec
+        BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libfec
+        CMAKE_ARGS "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+                   "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+                   "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+        INSTALL_COMMAND ""
+    )
+
+    ExternalProject_Get_Property(FEC_EXTERNAL binary_dir)
+    add_library(fec SHARED IMPORTED)
+    # Only binary_dir is queried above and INSTALL_COMMAND is empty, so the library lives in the build tree
+    set_property(TARGET fec PROPERTY IMPORTED_LOCATION "${binary_dir}/libfec.so")
+
+    add_dependencies(fec FEC_EXTERNAL)
+    set(FEC_LIBRARIES "${binary_dir}/libfec.so")
+    set(FEC_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libfec")
+
+    # Install the header and the library in the standard places
+    install(FILES
+        "${FEC_INCLUDE_DIRS}/fec.h"
+        DESTINATION "include"
+    )
+    install(FILES
+        ${FEC_LIBRARIES}
+        DESTINATION lib${LIB_SUFFIX}
+    )
+else()
+    add_library(fec INTERFACE)
+endif()
+
 # Search for GNU Radio and its components and versions. 
Add any # components required to the list of GR_REQUIRED_COMPONENTS (in all # caps such as FILTER or FFT) and change the version to the minimum diff --git a/apps/flowgraphs/debug_afsk_transceiver_osmocom.py b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py new file mode 100755 index 0000000..a9e0892 --- /dev/null +++ b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +################################################## +# GNU Radio Python Flow Graph +# Title: Debug Afsk Transceiver Osmocom +# Generated: Mon Jun 13 20:30:12 2016 +################################################## + +if __name__ == '__main__': + import ctypes + import sys + if sys.platform.startswith('linux'): + try: + x11 = ctypes.cdll.LoadLibrary('libX11.so') + x11.XInitThreads() + except: + print "Warning: failed to XInitThreads()" + +from PyQt4 import Qt +from gnuradio import analog +from gnuradio import audio +from gnuradio import blocks +from gnuradio import eng_notation +from gnuradio import filter +from gnuradio import gr +from gnuradio import qtgui +from gnuradio.eng_option import eng_option +from gnuradio.filter import firdes +from gnuradio.qtgui import Range, RangeWidget +from optparse import OptionParser +import math +import numpy +import satnogs +import sip +import sys + + +class debug_afsk_transceiver_osmocom(gr.top_block, Qt.QWidget): + + def __init__(self): + gr.top_block.__init__(self, "Debug Afsk Transceiver Osmocom") + Qt.QWidget.__init__(self) + self.setWindowTitle("Debug Afsk Transceiver Osmocom") + try: + self.setWindowIcon(Qt.QIcon.fromTheme('gnuradio-grc')) + except: + pass + self.top_scroll_layout = Qt.QVBoxLayout() + self.setLayout(self.top_scroll_layout) + self.top_scroll = Qt.QScrollArea() + self.top_scroll.setFrameStyle(Qt.QFrame.NoFrame) + self.top_scroll_layout.addWidget(self.top_scroll) + self.top_scroll.setWidgetResizable(True) + self.top_widget = Qt.QWidget() + 
self.top_scroll.setWidget(self.top_widget) + self.top_layout = Qt.QVBoxLayout(self.top_widget) + self.top_grid_layout = Qt.QGridLayout() + self.top_layout.addLayout(self.top_grid_layout) + + self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom") + self.restoreGeometry(self.settings.value("geometry").toByteArray()) + + ################################################## + # Variables + ################################################## + self.samples_per_symbol_tx = samples_per_symbol_tx = 4 + self.sq_wave = sq_wave = (1.0, ) * samples_per_symbol_tx + self.gaussian_taps = gaussian_taps = filter.firdes.gaussian(1.0, samples_per_symbol_tx, 1.0, 4*samples_per_symbol_tx) + self.deviation = deviation = 800 + self.baud_rate = baud_rate = 1200 + self.tx_frequency = tx_frequency = 145.835e6 + self.samp_rate_tx = samp_rate_tx = 48e3 + self.modulation_index = modulation_index = deviation / (baud_rate / 2.0) + self.interp_taps = interp_taps = numpy.convolve(numpy.array(gaussian_taps), numpy.array(sq_wave)) + self.atten = atten = 0.1 + + ################################################## + # Blocks + ################################################## + self._atten_range = Range(0, 0.9, 0.01, 0.1, 200) + self._atten_win = RangeWidget(self._atten_range, self.set_atten, "Attenuation", "counter_slider", float) + self.top_layout.addWidget(self._atten_win) + self.satnogs_upsat_fsk_frame_encoder_0 = satnogs.upsat_fsk_frame_encoder([0x33]*8, [0x7A, 0x0E], False, False, False, True, True, "ABCD", 0, "UPSAT", 0, 1024) + self.satnogs_udp_msg_source_0 = satnogs.udp_msg_source("127.0.0.1", 16886, 1500) + self.satnogs_debug_msg_source_0 = satnogs.debug_msg_source("HELLO"*4, 1, True) + self.rational_resampler_xxx_0 = filter.rational_resampler_ccc( + interpolation=10, + decimation=1, + taps=None, + fractional_bw=None, + ) + self.qtgui_time_sink_x_0_0_0 = qtgui.time_sink_c( + 1024, #size + samp_rate_tx, #samp_rate + "", #name + 1 #number of inputs + ) + 
self.qtgui_time_sink_x_0_0_0.set_update_time(0.10) + self.qtgui_time_sink_x_0_0_0.set_y_axis(-1, 1) + + self.qtgui_time_sink_x_0_0_0.set_y_label("Amplitude", "") + + self.qtgui_time_sink_x_0_0_0.enable_tags(-1, True) + self.qtgui_time_sink_x_0_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "") + self.qtgui_time_sink_x_0_0_0.enable_autoscale(False) + self.qtgui_time_sink_x_0_0_0.enable_grid(False) + self.qtgui_time_sink_x_0_0_0.enable_control_panel(True) + + if not True: + self.qtgui_time_sink_x_0_0_0.disable_legend() + + labels = ["", "", "", "", "", + "", "", "", "", ""] + widths = [1, 1, 1, 1, 1, + 1, 1, 1, 1, 1] + colors = ["blue", "red", "green", "black", "cyan", + "magenta", "yellow", "dark red", "dark green", "blue"] + styles = [1, 1, 1, 1, 1, + 1, 1, 1, 1, 1] + markers = [2, -1, -1, -1, -1, + -1, -1, -1, -1, -1] + alphas = [1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0] + + for i in xrange(2*1): + if len(labels[i]) == 0: + if(i % 2 == 0): + self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Re{{Data {0}}}".format(i/2)) + else: + self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Im{{Data {0}}}".format(i/2)) + else: + self.qtgui_time_sink_x_0_0_0.set_line_label(i, labels[i]) + self.qtgui_time_sink_x_0_0_0.set_line_width(i, widths[i]) + self.qtgui_time_sink_x_0_0_0.set_line_color(i, colors[i]) + self.qtgui_time_sink_x_0_0_0.set_line_style(i, styles[i]) + self.qtgui_time_sink_x_0_0_0.set_line_marker(i, markers[i]) + self.qtgui_time_sink_x_0_0_0.set_line_alpha(i, alphas[i]) + + self._qtgui_time_sink_x_0_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0_0.pyqwidget(), Qt.QWidget) + self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_0_win) + self.qtgui_time_sink_x_0_0 = qtgui.time_sink_f( + 1024, #size + samp_rate_tx, #samp_rate + "", #name + 1 #number of inputs + ) + self.qtgui_time_sink_x_0_0.set_update_time(0.10) + self.qtgui_time_sink_x_0_0.set_y_axis(-1, 1) + + self.qtgui_time_sink_x_0_0.set_y_label("Amplitude", "") + + 
self.qtgui_time_sink_x_0_0.enable_tags(-1, True) + self.qtgui_time_sink_x_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "") + self.qtgui_time_sink_x_0_0.enable_autoscale(False) + self.qtgui_time_sink_x_0_0.enable_grid(False) + self.qtgui_time_sink_x_0_0.enable_control_panel(True) + + if not True: + self.qtgui_time_sink_x_0_0.disable_legend() + + labels = ["", "", "", "", "", + "", "", "", "", ""] + widths = [1, 1, 1, 1, 1, + 1, 1, 1, 1, 1] + colors = ["blue", "red", "green", "black", "cyan", + "magenta", "yellow", "dark red", "dark green", "blue"] + styles = [1, 1, 1, 1, 1, + 1, 1, 1, 1, 1] + markers = [2, -1, -1, -1, -1, + -1, -1, -1, -1, -1] + alphas = [1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0] + + for i in xrange(1): + if len(labels[i]) == 0: + self.qtgui_time_sink_x_0_0.set_line_label(i, "Data {0}".format(i)) + else: + self.qtgui_time_sink_x_0_0.set_line_label(i, labels[i]) + self.qtgui_time_sink_x_0_0.set_line_width(i, widths[i]) + self.qtgui_time_sink_x_0_0.set_line_color(i, colors[i]) + self.qtgui_time_sink_x_0_0.set_line_style(i, styles[i]) + self.qtgui_time_sink_x_0_0.set_line_marker(i, markers[i]) + self.qtgui_time_sink_x_0_0.set_line_alpha(i, alphas[i]) + + self._qtgui_time_sink_x_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0.pyqwidget(), Qt.QWidget) + self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_win) + self.interp_fir_filter_xxx_0 = filter.interp_fir_filter_fff(samples_per_symbol_tx, (interp_taps)) + self.interp_fir_filter_xxx_0.declare_sample_delay(0) + self.blocks_vco_f_0 = blocks.vco_f(48e3, -48e3, 1.0) + self.blocks_multiply_const_vxx_0 = blocks.multiply_const_vff((atten, )) + self.audio_sink_0_0 = audio.sink(48000, "", True) + self.analog_quadrature_demod_cf_0 = analog.quadrature_demod_cf(48e3/(2*math.pi*deviation/8.0)) + self.analog_frequency_modulator_fc_0 = analog.frequency_modulator_fc((math.pi*modulation_index) / samples_per_symbol_tx) + + 
################################################## + # Connections + ################################################## + self.msg_connect((self.satnogs_debug_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu')) + self.msg_connect((self.satnogs_udp_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu')) + self.connect((self.analog_frequency_modulator_fc_0, 0), (self.qtgui_time_sink_x_0_0_0, 0)) + self.connect((self.analog_frequency_modulator_fc_0, 0), (self.rational_resampler_xxx_0, 0)) + self.connect((self.analog_quadrature_demod_cf_0, 0), (self.blocks_multiply_const_vxx_0, 0)) + self.connect((self.blocks_multiply_const_vxx_0, 0), (self.audio_sink_0_0, 0)) + self.connect((self.blocks_vco_f_0, 0), (self.qtgui_time_sink_x_0_0, 0)) + self.connect((self.interp_fir_filter_xxx_0, 0), (self.analog_frequency_modulator_fc_0, 0)) + self.connect((self.interp_fir_filter_xxx_0, 0), (self.blocks_vco_f_0, 0)) + self.connect((self.rational_resampler_xxx_0, 0), (self.analog_quadrature_demod_cf_0, 0)) + self.connect((self.satnogs_upsat_fsk_frame_encoder_0, 0), (self.interp_fir_filter_xxx_0, 0)) + + def closeEvent(self, event): + self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom") + self.settings.setValue("geometry", self.saveGeometry()) + event.accept() + + + def get_samples_per_symbol_tx(self): + return self.samples_per_symbol_tx + + def set_samples_per_symbol_tx(self, samples_per_symbol_tx): + self.samples_per_symbol_tx = samples_per_symbol_tx + self.set_gaussian_taps(filter.firdes.gaussian(1.0, self.samples_per_symbol_tx, 1.0, 4*self.samples_per_symbol_tx)) + self.set_sq_wave((1.0, ) * self.samples_per_symbol_tx) + self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx) + + def get_sq_wave(self): + return self.sq_wave + + def set_sq_wave(self, sq_wave): + self.sq_wave = sq_wave + self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), 
numpy.array(self.sq_wave))) + + def get_gaussian_taps(self): + return self.gaussian_taps + + def set_gaussian_taps(self, gaussian_taps): + self.gaussian_taps = gaussian_taps + self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), numpy.array(self.sq_wave))) + + def get_deviation(self): + return self.deviation + + def set_deviation(self, deviation): + self.deviation = deviation + self.set_modulation_index(self.deviation / (self.baud_rate / 2.0)) + self.analog_quadrature_demod_cf_0.set_gain(48e3/(2*math.pi*self.deviation/8.0)) + + def get_baud_rate(self): + return self.baud_rate + + def set_baud_rate(self, baud_rate): + self.baud_rate = baud_rate + self.set_modulation_index(self.deviation / (self.baud_rate / 2.0)) + + def get_tx_frequency(self): + return self.tx_frequency + + def set_tx_frequency(self, tx_frequency): + self.tx_frequency = tx_frequency + + def get_samp_rate_tx(self): + return self.samp_rate_tx + + def set_samp_rate_tx(self, samp_rate_tx): + self.samp_rate_tx = samp_rate_tx + self.qtgui_time_sink_x_0_0_0.set_samp_rate(self.samp_rate_tx) + self.qtgui_time_sink_x_0_0.set_samp_rate(self.samp_rate_tx) + + def get_modulation_index(self): + return self.modulation_index + + def set_modulation_index(self, modulation_index): + self.modulation_index = modulation_index + self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx) + + def get_interp_taps(self): + return self.interp_taps + + def set_interp_taps(self, interp_taps): + self.interp_taps = interp_taps + self.interp_fir_filter_xxx_0.set_taps((self.interp_taps)) + + def get_atten(self): + return self.atten + + def set_atten(self, atten): + self.atten = atten + self.blocks_multiply_const_vxx_0.set_k((self.atten, )) + + +def main(top_block_cls=debug_afsk_transceiver_osmocom, options=None): + + from distutils.version import StrictVersion + if StrictVersion(Qt.qVersion()) >= StrictVersion("4.5.0"): + style = gr.prefs().get_string('qtgui', 
'style', 'raster') + Qt.QApplication.setGraphicsSystem(style) + qapp = Qt.QApplication(sys.argv) + + tb = top_block_cls() + tb.start() + tb.show() + + def quitting(): + tb.stop() + tb.wait() + qapp.connect(qapp, Qt.SIGNAL("aboutToQuit()"), quitting) + qapp.exec_() + + +if __name__ == '__main__': + main() diff --git a/apps/flowgraphs/device_args_handler.py b/apps/flowgraphs/device_args_handler.py new file mode 100644 index 0000000..346a07f --- /dev/null +++ b/apps/flowgraphs/device_args_handler.py @@ -0,0 +1,5 @@ +# this module will be imported in the into your flowgraph + +def append_dev_args(device, dev_args): + if(len(dev_args) == 0): + return 0 diff --git a/apps/flowgraphs/satellites/mpla.ogg b/apps/flowgraphs/satellites/mpla.ogg new file mode 100644 index 0000000000000000000000000000000000000000..67b83eb25f9d8ec81db938e9046bcc7c3ca36ef7 GIT binary patch literal 9044 zcmb_>c|6qL+xMr{k|bnbvyLT=?2#>D?2~nD*_R11mh1|J5ZRdu8JcW~u|}&UvhN8g zOtzFl_WR7J@BMv#&-1$Pf1c0l%$ak}b)D;+Yk6Pi%x4T-TrNWgp+66=b&?0EdGw=# z+X1cv{$4)zZWz+Qfrtju7X+QfA0mI(9xx!y{O2LfJOFy5W9e54?BJuJuWoQr1F3|u^Y!#{ z!SHywxbQf6I(Ruc`5ZX1FIrSh-ShxN37Fx!Jk1e3NhT0P2SHc(VYDgE+Av&do^W7h zsuro$d^R>SHMW_>As*iHx8jjoKEsj9Ia4K57(Tqq^Wgo+Piv?7-Iegl9i?BY_B9C)#QcJe+ zMCwY;6{P4Z%sqaTs4y>F)uVPyys8hO3m-IfC~G`w9fIkSCu09pRAfDPz+TjjvO3bK zQIf>RzzQ>cfT(@7c%TTNCZw6ns9nL>QNhvK%WtqEX2c-fdrsrBv7QNd{H)FVoreAW zhW$e=GNUbPLoI5fEk~lQ*Q4!NZv830Lnqfsg{(Ty0VtPGKK%J1m(FTJF;0KfLNA1 zN_Cq3l>kFb-ZW2kPxjAFdD`rmTs&h=|EpNz9HBaVW)9hdSG!oh0AIq|vzl|Wda^NH z+BDg?{Z=4-4n--x4~(%Av9v9$c7V^VFPj2CY7XXF^kmOIu44qP*wh>$LY?uxygy-q zx50M3lAi)IVn5`d>!_?jYr_%yt&@s0_MhRS0{mt-!qRg8CQoSMQF}(-Lf#ZE-@CLa zx$rhv0arreB?Law5XmZp^Sw#doCkuU=*Ut0w<3q~eiRqnixX)Vu6!@qAx5f7-gGG} zJbgo_P6G!~EG7-2IDs(x08=Kfg~kzvP@=e0Bm#!pM-*@>k7JVRQC2cYZe{wk!-@bZ z{_VK${M{!xd;e9se{dMl3xLuqDSr8!l8Ld2nZJichTm9CwDpMJalPW>iMgZ@sV)lQZ*p%&GD0J9I9$zi{LA%_H!)AEU5wC)}LZ{*|( zCw`Vo&#hKvB*)MDUoS?uFg+juU< z!zo##V&F%4BYRXS1)@_WDS6Y9kZ)N)_Z?4Es+u-yY7$Zl2SE`K1^67x>q$CwKm`jy zH5PW;I 
z=Xg?)NH=`y07_VZ00;tULnt)7aLAs9EUL6%z}_8JJ%lPDFJ3@eoJ7zT$LJ#{c+i&Ga#RG+FauAmR4jUyN$bOdliYOqqmki9ORz=qHUqEOm+!T_qYl)!SH5*AI` zRaH7F4vu4T5XO-g$AQHO)uhF%jFE1n8SbSmU~xQ95Dl*$a!-J_a5!|z;}{*>VJ!ow zc=2iuhZ5-kvgS-+R|G&1ZviGD?@AoZ&@F9dLy$#*#fMM{r3ALi2%w<5E*>1E?+FAJ z446S$gTMkrwX&ea!SVa@0L1e$O487^4FT~`gTwhKZwvMuC9Ihxs)Yx~5=jSZ&cx82 z#<5^%)mlIc@s8AjaYoUT{6dF-!N9R>_z9flS0W4FeukoNl^llD4nizZsrK*Xi9Emu zQAx;nc_K#?5*bH_jDr)gk%kCF8d4WNOB!>>BN6Zx@)+JJo-{@WBon~VkBy{LgTV+x zV6HD%4Tl?10CT~Tz}_ez5Afk|L-q?~J^|bS>JD!d5}_@COSiuR2Q9#tin;?&H%21x zfbUL43lahH-VhjT$p-5Jhs9U4pe`dw<^p3VG`P-C(1N?vr5^1|X&NK0ucIk)$sx-K zEJb5a1HXmE&`DAzV`$VUczr-16gVQ)+<0-i2rYpECV3ENNJ&^8&JZamP+*LLLC_*F z@8U}}VJSxeD2W1`UImQcQgbU6n4yJ39RaND5D#J+j1g0jK;=vfs5roR0q#A(dK?b# zfw?e=z!qSvl|&$79rjNXDhLv}1`gk+Mvj=9$+-ZY2O&H-CEy6MUD={p05>I0N((*< zHjTjVcTJoqcCY!BJYLZQ5sDC`1tVW90lG*=Cwb%VJriE| zPwT(O$gSBwg!d%p#LLfekmh1B`XKyo9D-CL5>DqZ#2-FI(vcFlI^K>P{z-^OsbmAE znX=)qk@Td91_??42O?j*`n^01mIhQ2_ooM45Tqp)AakMd1SW?BPLR#w#9P=fOTaWh z9}u4)3621kp=@;+i3KQX1*QQ)CxJ*Z2xyho6_KP2Sz(ryNKy#1*kE9WOhq|JMPNkk z4WiFt%JmxA4j&=t5)5cS`^Qsh`!L;ioCrWXIV4D3M-qO0_X%ok{~|>7kbq4l7^=Z= zUAX>fh`$7SgfK4kuP7V|mw#C3Nr8n_*bkHaJITH%a>OAKRQr77%StLeX_Nai zKmd~X^7aWx1UYU=qDb;U@bdm8fQ875lL<&QN`Zd}I3NJ*YM%fW`r9H&fuJ6;KToDH z*%z^zaoymfW)|DDeV^2{Gee|Pc!zx)4O0(#s*L-Fz}6}yYMwt^_f z@iSx$G6BTB$#h6m6#_6up)+q9s(mde=tEqskVdGIjD-X+8It_^IQ1R^QQFMp5T6x^ zM2dh63i9C)tr|j{z+`!+Vitv_$SVLDloViPjgd(Rbm!&7^Sk2*kdHo8oj;vt5v5I5qUAWg&u(<%HOKqb`8a{2bC z61jX4rQj6oQlj7u`CdmbM|VmSCn8TMP)@oRgTO{FE3H3>^n`;*?s-LGQLSY`cgwGH zMA0hz=Awbx60SiiR%h=eDReyTi(xl-Np}O}7iuU59BA<%0Y&Zm#6E5x>i+^W_>mwe z2QJ{|ugc!{JxTansK1hb;^4!clR3$}8Ax~vgX-M@!#inLZkt|gAnyG8qPNwDk^7qAHB@IrYk3OUj~9yW8Txn z#x(Q5^Voz%3J*mzAGs9*we^f>HPl9&Ib_w-Y)zp?sYZ9xoI~XRm7LtvEuo3;QS8{< z%f|Dhbx6OfKLIO@LZ|byvc7!%{u`n?$;u0Z%d2V|fpiQW9_Tknf~FBMuFkV^3W^t0 z)Gl7qLg?xnB8~rf)esR8LFQEfbwtGe%Y>KAmc6|SU7M3fd+2z!UzU#xr-{F4*d~mHy98M`g>3k3tYskYYy1>!4BYbaFOpDu zKF7uKkg@vEt82Hc-4udC?xM+1+^rpliJA0-Ri+Ky{ 
zZEQbfYVMvtRzMK2tpA)f5z04nR8Y#)D7B(l%!VgEsmXmxl5zRhbc=(syM%IE`0m}u zKc+NWyj<6^F*|2!exVzGO)VpXR;GsoJTLDVoy)0wtTCMGZ&Njhdc!s;Ib93W=WpCO zl^A#Aa>+ZpC9_1_p4rMo`gGb){6UKAp>FBwEHk!`>D#c>nsS9B;~~LcAVpIR+Imi= z%*~4BfP@A0&@lY%^4*}(;&AZ>+b|{1-QB$V!`zp~gHF(H2s&&VFc;3TT zyO-cGeE3yn*gB@cdm;?s?i43O+kN#LZfKihceKSh&Z3(t)36a27Fe`0v|w>@=zZO% z!ZX`4BYym+-UeE=K#n#ip~G!2x4fKQNR)1`T;JH~sKNZ$eW9n$DIbb$CffTlBjyQv z)f*Loh}F8zxrEp?XUROyJ!>VY^}8MY%@JLm1EE6KCW!prG_Xen(n=F4}ZL;sJIa% z!EF%k&^aE2zz^?!t}UzidA)KHBfVPm`PbDw>{|MG%-U*fG`H@~(mkc2SWV3H!E@A3 zW>rfm*_sd5Cij+~vzX27G%xJ+E>5^)VU(8a7r3r>RsQxTHV(IaP_SX@FB)rGSRKNc z&bXPmHxwz>!zRbN1-+=pU$Fs|sYoJ)O<37C85D;I%`CvKGdTBtrxP}q`8n8dPdls;m1N{o+q8XmoA#W zP8LkLYg8I2-PJ5JR{mmd_{R2YzpRUB*5H-LPAm7N$9$Xobm{~A4;#;L%zAC9=il4# zE`QBbH%?nulZ481o{;}Fy^d$JaX81N#Nnx4i`*V_B36f*TvXn4_ppv%IDKPod!R_Q zVKVX;Uy@F9?M*$L<}GDnp@?WY<=JeOCyNYyajqPfC3e4UooT)h*iQc=rkMKVOBb=S zvu!>F?Jo}-Y5Cf%2geur1(Z|-Ja~PrIr$Ck#Ul#p@74YWSDONwyBqq?Lpz3a@b+kDm++rc*HPPHg9Is z=XWEFu6?5TJo_t=3vc7N)_M^3>=~HN^EPU(?B$~qbsMgCUM&hJv+Wj#oU^;h?r|i$ zmZwvQDQ`gRg=BktO$0|gd{ZzmB!AK<@+mr0`ff>`e#YpI1o}qCdxbD@Fl&Old+^}(HsIuq@Fx2CkuNsn*~JWF2lecJTZ(sK4Y z9fyg&Q1qof6K?Alzp-g6{>a83ZZE_Y62=bXRDR4Ru70CNrQH4%CuU-uHn(|fB zpnK8oE%&YQ(@lK_>7&Li_VeD$7>DyQbHn+Yz3bl?BAt0WINh5wk3Pc{1<4W%ByVOh z-N+ua-;`6!6%_ncYW!+Xt}wrPj}@TF2H5Vn~tx$J|ErwQ8V#jdZDx7 z{b~l%tTK=Fd8u`wLm5wJ%jiYhoyn%E3UhIxF00@s+Lq6@SGuYvRS=saGZUqwvNb+Y zh9h6SyiRqcDXh#@i@X{!nf2ST8kpWJM$er}&@UD(A1u<*Lt4+W(kzs>ni#JRT((#m zqa2Za@jDbLA8&*-yWJm$#dlL3M<$uFBZ=Qo!R6l1y6o1p2NGjH+3n`n&3~ILbxBSN zV^{yZZpv-?l@{LHaEP_(v3@9{_SWjN%&H?|>krdwE2s@GT%c$FI%~@0>&Wkija*Cg z*nJnCkb1-;N8)qN&Wg$g_tnSwK@<6PkBhz4`+8M<+x8D_fwKTXZluD zM{w>9{g33}ba&>(|pG9d@|R9GJ@_7%G-_a?>j*I9H*_Sy~K7hKxoR7dPqO#G_O1h_sB zyT9RWoioF7_hxKokg_aS@d9)IEC0COD|=EOhb>0z;lU3nJThkuDMau>Md~G@3;c(t z9MHV!2wxyMiU^+MBF2pDnqR z&~LnrLn0mX)BVj$zZXTf&KEYykM0^(wiK4XuQcvrbnrkY_XqW^#Lq9ezRFsrP-7p@ zD(|!_Z4@2*Ao?Yo0sXpK!mEEXduUPC)-71%?fv!>#jWg~N&Qs~y8(vo6XW7>Mq!zW zRTX?623Cv!K)Z@2r5R!E(Z 
zbb+>y&7!NQCss3fYqx9aOu(zX3t|nStINl1v)*xk3!yJEQamp(zQtx99OQtlFuHqj zd0PzA*kYWS$|@1kRDJu@@e@zxSA)5Jzt@1pVhAaF(jbLO;io^U1 z$Kx)9MiDqn`jowQ*F*c3Vfgo+;qIOUhZ4EZnI@>Mg%{WL*1m<$B=S^R?ck{K-_}ht z@LoHaJt4T=Hy4zD&6qcAarjB_Jwyd==#5o}Om#LjS2x@>rLeHXY;!T(dP5t^kX*Ck zTI%Uv^f0F{kTtsMDsIq7;uurA2C;7M#Z=#nGeHkG-omn4IBjLig*QtNpH3)igWJ|y zJ7jw5u>JIisWsoLp?c^QIi&vl1N!R;QQ5dl*3(~m}6q_+Tj$U ze3>IOq)EBKv|LDUjD8EfA@;aa>vBpbK@Qi!c@krKftY|>2*Pi3%9bcRXXW~7EeoGs zRAkj&=vpXw9>!yLuh%s+tRFM!m_M|N4M|0O%za5d#D?D$UlItwX;-7Nc zt*YiDr|=TT=X-R=S^R}0y6uuTuCRab)5>f;_>g%a?JQ;4aixK#$}}5=>dh6kOig-& zvsvQlDww{M?Ci>u)l#9l#^}4gcRLui6oxxd^EL%rBa@=ZNVr?cM1#lKwY1-)zd}<7+XR+bPl9Ko-RVHD4-PTEf^_wG(!{o5#xu2FE z&EAA2F*l58#K=U?s@oONF%w79@op0`D1S6mjCCCW2=!517y?@A<0e~&C|5iesWD$o9gB=zNS=akL~8#dU<4= zk`VW{b!Rx^0O_QAU4G&Bw9Wa%LQl5s_k8bvf)}s9<4n8`{-HZN>6x*x?oh){c%c#I z@}ygfC}VqUj@Q1u$nAZg3=zLX)RF$k*^%;AaGCnZ@Y0CB#B56cwUvZ!SiHxAzS_0f zJHJkc_9?qs`!2*zd(|R@K;u^2IYC6qe_Qr&Y zZ~cOwq%kI)0HjaE4G))SZgWoyE7|SCsgy?&3e1) zT1oV3@9SB9qqmDSY>{uTRatyq=U(60^l~v}QO=nzDR$ieoAkG`-*aSRYI$;ilmrmW`Y#=YzL+}Jy&P-Jhfn6;lXw}$w^n}S{8161H4Of556mNt^$ z7iKkyt>JH`4rIL{8b7m}UJ5Jux!1ZEZ6?Ou=#f=FQMf|bD7jO9F_CsAGQdbrXH2$l zQ1`qx|sf&zn8Rm?BiaX*buXeEY z?r0h|eXy^7A>nn=w#iK+;)aM6{moAW%k6z#YxM>>ccOc-+htFNe6@B^Ia1>;C#K}{ z3}##d2?QcPTYu}XA487ZF$SHbjY5(5_TZKNJKcPkkDT3YuZ7~2$2OnI`g7`Q@(1>W zzzr5h>z{ozDAF`A$f(iVuI*eD_c*cmkShLB9es8BNR~bp6EY^h>A6sXQ}TZ1U+PO+ zy~f39*Zf_}p&Pn)>r!lEy`Pv@6@M z6Swpx3QImUe92jAH+*=~<)lUR`r_Tat}*d^D}p&6WBoV%rRd{s2mEZ!pO_oD!2N!g zETSyS*u%pUt`sE8jSl3`s}P6>?vB(Qea$F1_qt{5a#4)&i}0>a>0AcmYML`&mGGrh zdganmQlZTHci7%9r8Xl{gi*YAMz1n3D~H5en|w*STV;1-e*05z<D4C+ zjWVPLB*m$gCS3OdFmBApIj~NF$#ph0%VcY-! 
literal 0
HcmV?d00001

diff --git a/cmake/Modules/FindFec.cmake b/cmake/Modules/FindFec.cmake
index d1197a8..76b4e42 100644
--- a/cmake/Modules/FindFec.cmake
+++ b/cmake/Modules/FindFec.cmake
@@ -22,4 +22,4 @@ FIND_LIBRARY(
 )
 
 INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS)
\ No newline at end of file
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS)
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 8bf9a01..e2a6153 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -81,6 +81,9 @@ if(NOT satnogs_sources)
 endif(NOT satnogs_sources)
 
 add_library(gnuradio-satnogs SHARED ${satnogs_sources})
+
+add_dependencies(gnuradio-satnogs fec)
+
 target_link_libraries(gnuradio-satnogs
   ${Boost_LIBRARIES}
   ${GNURADIO_ALL_LIBRARIES}
diff --git a/libfec/CMakeLists.txt b/libfec/CMakeLists.txt
new file mode 100644
index 0000000..684a6d0
--- /dev/null
+++ b/libfec/CMakeLists.txt
@@ -0,0 +1,323 @@
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.8)
+project(libfec ASM C)
+
+option(BUILD_32BIT_ON_64BIT "Build a 32-bit library on a 64-bit system" OFF)
+
+# Select the release build type by default to get optimization flags
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+    message(STATUS "Build type not specified: defaulting to release.")
+endif(NOT CMAKE_BUILD_TYPE)
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)  # this directory's modules, even when libfec is not the top-level project
+
+if(NOT LIB_INSTALL_DIR)
+    set(LIB_INSTALL_DIR lib)
+endif()
+
+
+########################################################################
+# Version information
+########################################################################
+set(VERSION_INFO_MAJOR 3)
+set(VERSION_INFO_MINOR 0)
+set(VERSION_INFO_PATCH 0)
+
+if(NOT DEFINED VERSION_INFO_EXTRA)
+    set(VERSION_INFO_EXTRA "git")
+endif()
+include(Version)
+
+if(NOT DEFINED VERSION)
+    #set(VERSION "\"${VERSION_INFO_MAJOR}.${VERSION_INFO_MINOR}.${VERSION_INFO_PATCH}\"")
+    set(VERSION "\"${VERSION_INFO}\"")
+endif()
+
+
+########################################################################
+# Compiler specific setup
+########################################################################
+if(BUILD_32BIT_ON_64BIT)
+    set(CMAKE_SYSTEM_PROCESSOR "i386")
+    set(CMAKE_SIZEOF_VOID_P 4)
+    # Append -m32 instead of overwriting, so user-supplied flags are kept
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -m32")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
+    add_definitions(-m32)
+endif()
+
+if((CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|x86|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 4))
+    set(TARGET_ARCH "x86")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 8))
+    set(TARGET_ARCH "x64")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "i386") AND (CMAKE_SIZEOF_VOID_P EQUAL 8) AND (APPLE))
+    # Mac is weird like that.
+    set(TARGET_ARCH "x64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")  # "^arm*" would also match any name starting with "ar"
+    set(TARGET_ARCH "ARM")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
+    set(TARGET_ARCH "ppc64" "ppc64le")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+    set(TARGET_ARCH "ppc64" "ppc")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
+    set(TARGET_ARCH "ppc")
+endif()
+
+
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")  # GCC-style flags for GCC and Clang
+    add_definitions(-Wall)
+    add_definitions(-Wno-unused)
+
+    if(TARGET_ARCH MATCHES "x64")
+        add_definitions(-fPIC)
+        add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "x86")
+        add_definitions(-mmmx)
+        add_definitions(-msse)
+        add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+        add_definitions(-fno-common)
+        add_definitions(-faltivec)
+    endif()
+
+endif()
+
+########################################################################
+# Find build dependencies
+########################################################################
+
+# libm (REQUIRED is a find_library() keyword only since CMake 3.18, so check the result explicitly)
+find_library(M_LIB m)
+if(NOT M_LIB)
+    message(FATAL_ERROR "libm (math library) not found")
+endif()
+
+
+########################################################################
+# config.h
+########################################################################
+
+#add_definitions(-DHAVE_CONFIG_H)
+
+# Checks for includes
+include(CheckIncludeFile)
+check_include_file("getopt.h" HAVE_GETOPT_H)
+check_include_file("stdio.h" HAVE_STDIO_H)
+check_include_file("stdlib.h" HAVE_STDLIB_H)
+check_include_file("memory.h" HAVE_MEMORY_H)
+check_include_file("string.h" HAVE_STRING_H)
+
+# Checks for functions
+include(CheckFunctionExists)
+check_function_exists("getopt_long" HAVE_GETOPT_LONG)
+check_function_exists("memset" HAVE_MEMSET)
+check_function_exists("memmove" HAVE_MEMMOVE)
+
+
+########################################################################
+# Setup apps
+########################################################################
+
+if(TARGET_ARCH MATCHES "x64")
+    list(APPEND libfec_sources
+        # dotprod_port.c, peakval_port.c, sumsq.c and sumsq_port.c come from the common list below
+        cpu_mode_x86_64.c
+        ##asm
+        #sse2bfly27-64.s
+        #sse2bfly29-64.s
+    )
+
+elseif(TARGET_ARCH MATCHES "x86")
+    list(APPEND libfec_sources
+        viterbi27_mmx.c
+        viterbi27_sse.c
+        viterbi27_sse2.c
+        viterbi29_mmx.c
+        viterbi29_sse.c
+        viterbi29_sse2.c
+        viterbi39_sse2.c
+        viterbi39_sse.c
+        viterbi39_mmx.c
+        viterbi615_mmx.c
+        viterbi615_sse.c
+        viterbi615_sse2.c
+        dotprod_mmx.c
+        dotprod_sse2.c
+        #peakval_mmx.c
+        #peakval_sse.c
+        #peakval_sse2.c
+        # sumsq.c and sumsq_port.c come from the common list below
+        sumsq_sse2.c
+        sumsq_mmx.c
+        cpu_mode_x86.c
+        #asm
+        cpu_features.s
+        dotprod_mmx_assist.s
+        dotprod_sse2_assist.s
+        mmxbfly27.s
+        mmxbfly29.s
+        peak_mmx_assist.s
+        peak_sse2_assist.s
+        peak_sse_assist.s
+        peakval_mmx_assist.s
+        peakval_sse2_assist.s
+        peakval_sse_assist.s
+        sse2bfly27.s
+        sse2bfly29.s
+        ssebfly27.s
+        ssebfly29.s
+        sumsq_mmx_assist.s
+        sumsq_sse2_assist.s
+    )
+
+elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+    list(APPEND libfec_sources
+        viterbi27_av.c
+        viterbi29_av.c
+        viterbi39_av.c
+        viterbi615_av.c
+        encode_rs_av.c
+        dotprod_av.c
+        sumsq_av.c
+        peakval_av.c
+        cpu_mode_ppc.c
+    )
+else()
+    list(APPEND libfec_sources
+        cpu_mode_generic.c
+    )
+
+endif()
+
+
+list(APPEND libfec_sources
+    fec.c
+    sim.c
+    viterbi27.c
+    viterbi27_port.c
+    viterbi29.c
+    viterbi29_port.c
+    viterbi39.c
+    viterbi39_port.c
+    viterbi615.c
+    viterbi615_port.c
+    encode_rs_char.c
+    encode_rs_int.c
+    encode_rs_8.c
+    decode_rs_char.c
+    decode_rs_int.c
+    decode_rs_8.c
+    init_rs_char.c
+    init_rs_int.c
+    encode_rs_ccsds.c
+    decode_rs_ccsds.c
+    dotprod.c
+    dotprod_port.c
+    peakval.c
+    peakval_port.c
+    sumsq.c
+    sumsq_port.c
+    ccsds_tab.c
+    ccsds_tal.c
+)
+
+
+################################################################################
+# Generate pkg-config file
+################################################################################
+foreach(inc ${LIBFEC_INCLUDE_DIR})
+    list(APPEND LIBFEC_PC_CFLAGS "-I${inc}")
+endforeach()
+
+foreach(lib ${LIBFEC_LIBRARY_DIRS})
+    list(APPEND LIBFEC_PC_PRIV_LIBS "-L${lib}")
+endforeach()
+
+set(LIBFEC_PC_PREFIX ${CMAKE_INSTALL_PREFIX})
+set(LIBFEC_PC_EXEC_PREFIX \${prefix})
+set(LIBFEC_PC_LIBDIR \${exec_prefix}/${LIB_INSTALL_DIR})
+set(LIBFEC_PC_INCLUDEDIR \${prefix}/include)
+set(LIBFEC_PC_VERSION ${VERSION})
+set(LIBFEC_PC_LIBS "-lfec")
+
+# Use space-delimiter in the .pc file, rather than CMake's semicolon separator
+string(REPLACE ";" " " LIBFEC_PC_CFLAGS "${LIBFEC_PC_CFLAGS}")
+string(REPLACE ";" " " LIBFEC_PC_LIBS "${LIBFEC_PC_LIBS}")
+
+# Unset these to avoid hard-coded paths in a cross-environment
+if(CMAKE_CROSSCOMPILING)
+    unset(LIBFEC_PC_CFLAGS)
+    unset(LIBFEC_PC_LIBS)
+endif()
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libfec.pc.in
+    ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    @ONLY
+)
+
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    DESTINATION ${LIB_INSTALL_DIR}/pkgconfig/
+)
+
+
+########################################################################
+# Setup libraries
+########################################################################
+
+# generate ccsds_tab.c from the stdout of the gen_ccsds tool built here
+add_executable(gen_ccsds gen_ccsds.c init_rs_char.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c
+    COMMAND gen_ccsds > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c  # target name: CMake substitutes the built executable's path
+    DEPENDS gen_ccsds
+)
+
+# generate ccsds_tal.c from the stdout of the gen_ccsds_tal tool built here
+add_executable(gen_ccsds_tal gen_ccsds_tal.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c
+    COMMAND gen_ccsds_tal > ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c
+    DEPENDS gen_ccsds_tal
+)
+
+# libfec: shared library target, installed as "fec" (libfec.so on ELF platforms)
+add_library(libfec_shared SHARED ${libfec_sources})
+set_target_properties(libfec_shared PROPERTIES OUTPUT_NAME fec)
+target_link_libraries(libfec_shared ${M_LIB})
+
+
+install(TARGETS libfec_shared
+    DESTINATION ${LIB_INSTALL_DIR})
+install(FILES "${PROJECT_SOURCE_DIR}/fec.h"
+    DESTINATION include)
+
+
+########################################################################
+# Create uninstall target
+########################################################################
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+    IMMEDIATE @ONLY)
+
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
+
+
+########################################################################
+# Print Summary
+########################################################################
+message(STATUS "")
+message(STATUS "##########################################################")
+message(STATUS "## Building for version: ${VERSION}")
+message(STATUS "## Target Architecture: ${TARGET_ARCH}")
+message(STATUS "## Using install prefix: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "##########################################################")
+message(STATUS "")
+
diff --git a/libfec/INSTALL b/libfec/INSTALL
new file mode 100644
index 0000000..7c003a2
--- /dev/null
+++ b/libfec/INSTALL
@@ -0,0 +1,51 @@
+INSTALLATION INSTRUCTIONS
+
+CMake-based build:
+
+Works on most platforms.
Do + +mkdir build +cd build +cmake .. +make + + +If that fails, try the older automake-based build: + +./bootstrap +./configure +make +make test (optional) +make install (as root) + +By default, "make install" puts the libfec libraries in +/usr/local/lib, the include files in /usr/local/include, and the +manual page in /usr/local/man. + +You may have an old version of the GNU assembler that cannot handle +the relatively new SSE2 mnemonics. Update your version of the GNU +"binutils" package. + +You may obtain the latest binutils package through your normal +distribution channels or from: + +http://sources.redhat.com/binutils/ + +TESTING THE FEC LIBRARY + +After running the ./configure script, optional tests can be built and +run as follows: + +make test + +"make test" tests each routine, using the SIMD versions as +appropriate, verifying correct operation and estimating Viterbi +decoding speeds. These tests should always succeed unless something is +broken. + +28 Mar 2004 +Phil Karn, karn@ka9q.net + +3 Jan 2014 +Matthias P. Braendli, matthias@mpb.li + diff --git a/libfec/LICENSE b/libfec/LICENSE new file mode 100644 index 0000000..5a883d3 --- /dev/null +++ b/libfec/LICENSE @@ -0,0 +1,502 @@ +GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +(This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.) + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. 
+ + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. 
Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. 
+ + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + {description} + Copyright (C) {year} {fullname} + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + {signature of Ty Coon}, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/libfec/README b/libfec/README new file mode 100644 index 0000000..68d043e --- /dev/null +++ b/libfec/README @@ -0,0 +1,125 @@ +COPYRIGHT + +This package is copyright 2006 by Phil Karn, KA9Q. It may be used +under the terms of the GNU Lesser General Public License (LGPL). See +the file "lesser.txt" in this package for license details. + +It has been modified by Matthias P. Braendli, HB9EGM, so that it +compiles for x86_64 and for arm. 
+ +For installation instructions, please see INSTALL + +INTRODUCTION + +This package provides a set of functions that implement several +popular forward error correction (FEC) algorithms and several low-level routines +useful in modems implemented with digital signal processing (DSP). + +The following routines are provided: + +1. Viterbi decoders for the following convolutional codes: + +r=1/2 k=7 ("Voyager" code, now a widely used industry standard) +r=1/2 k=9 (Used on the IS-95 CDMA forward link) +r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions) + +2. Reed-Solomon encoders and decoders for any user-specified code. + +3. Optimized encoder and decoder for the CCSDS-standard (255,223) +Reed-Solomon code, with and without the CCSDS-standard "dual basis" +symbol representation. + +4. Compute dot product between a 16-bit buffer and a set of 16-bit +coefficients. This is the basic DSP primitive for digital filtering +and correlation. + +5. Compute sum of squares of a buffer of 16-bit signed integers. This is +useful in DSP for finding the total energy in a signal. + +6. Find peak value in a buffer of 16-bit signed integers, useful for +scaling a signal to prevent overflow. + +SIMD SUPPORT + +This package automatically makes use of various SIMD (Single +Instruction stream, Multiple Data stream) instruction sets, when +available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and +Altivec on the PowerPC G4 and G5 used by Power Macintoshes. + +"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine", +and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the +IA-32. + +Many of the SIMD versions run more than an order of +magnitude faster than their portable C versions. The available SIMD +instruction sets, if any, are determined at run time and the proper +version of each routine is automatically selected. If no SIMD +instructions are available, the portable C version is invoked by +default.
On targets other than IA-32 and PPC, only the portable C +version is built. + +The SIMD-assisted versions generally produce the same results as the C +versions, with a few minor exceptions. The Viterbi decoders in C have +a very slightly greater Eb/No performance due to their use of 32-bit +path metrics. On the other hand, the SIMD versions use the +"saturating" arithmetic available in these instructions to avoid the +integer wraparounds that can occur in C when argument ranges are not +properly constrained. This applies primarily to the "dotprod" (dot +product) function. + +The MMX (MultiMedia eXtensions) instruction set was introduced on +later Pentium CPUs; it is also implemented on the Pentium II and most +AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was +introduced in the Pentium III; AMD calls it "3D Now! Professional". +Intel introduced SSE2 on the Pentium 4, and it has been picked up by +later AMD CPUs. SSE support implies MMX support, while SSE2 support +implies both SSE and MMX support. + +The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott +New Instructions") was introduced in early 2004 with the latest +("Prescott") revision of the Pentium 4. Relatively little was +introduced with SSE3, and this library currently makes no use of it. + +See the various manual pages for details on how to use the library +routines. + +Copyright 2006, Phil Karn, KA9Q +karn@ka9q.net +http://www.ka9q.net/ + +This software may be used under the terms of the GNU Lesser General +Public License (LGPL); see the file lesser.txt for details. + +Revision history: +Version 1.0 released 29 May 2001 + +Version 2.0 released 3 Dec 2001: +Restructured to add support for shared libraries.
+ +Version 2.0.1 released 8 Dec 2001: +Includes autoconf/configure script + +Version 2.0.2 released 4 Feb 2002: +Add SIMD version override options +Test for lack of SSE2 mnemonic support in 'as' +Build only selected version + +Version 2.0.3 released 6 Feb 2002: +Fix to parityb function in parity.h + +feclib version 1.0 released November 2003 +Merged SIMD-Viterbi, RS and DSP libraries +Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time + +feclib version 2.0 (unreleased) Mar 2004 +General speedups and cleanups +Switch from 4 to 8-bit input symbols on all Viterbi decoders +Support for Altivec on PowerPC +Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code +Changed license to GNU Lesser General Public License (LGPL) + +feclib version 2.1 June 5 2006 +Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults + +feclib version 2.1.1 June 6 2006 +Fix test/benchmark time measurement on Linux diff --git a/libfec/README.x86-64 b/libfec/README.x86-64 new file mode 100644 index 0000000..bb4450c --- /dev/null +++ b/libfec/README.x86-64 @@ -0,0 +1,13 @@ +This library has been modified to compile natively on x86-64. + +An attempt has been made to adapt the assembly code, but due to unsolved issues with +the fact that shared libraries on x86-64 have to be compiled with PIC, this approach is +not finished. + +This code therefore only uses the portable C implementation, which is certainly slower than +the assembly SSE2 that could ideally be used. + +It could be said that we trade performance against the possibility to compile on x86-64. + +feb, 2012 +Matthias P. 
Braendli, HB9EGM diff --git a/libfec/bootstrap b/libfec/bootstrap new file mode 100755 index 0000000..2f58d5c --- /dev/null +++ b/libfec/bootstrap @@ -0,0 +1,6 @@ +#!/bin/bash + +aclocal && \ +autoheader && \ +autoconf + diff --git a/libfec/ccsds.h b/libfec/ccsds.h new file mode 100644 index 0000000..ae65468 --- /dev/null +++ b/libfec/ccsds.h @@ -0,0 +1,5 @@ +typedef unsigned char data_t; +extern unsigned char Taltab[],Tal1tab[]; +#define NN 255 +#define NROOTS 32 + diff --git a/libfec/char.h b/libfec/char.h new file mode 100644 index 0000000..25efd65 --- /dev/null +++ b/libfec/char.h @@ -0,0 +1,24 @@ +/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + + + diff --git a/libfec/cmake/Modules/Version.cmake b/libfec/cmake/Modules/Version.cmake new file mode 100644 index 0000000..e8d5bd5 --- /dev/null +++ b/libfec/cmake/Modules/Version.cmake @@ -0,0 +1,115 @@ +# Portions of this file have been borrowed from and/or inspired by +# the Version.cmake from the rtl-sdr project. +# http://sdr.osmocom.org/trac/wiki/rtl-sdr +# +# Provides: +# ${VERSION_INFO_BASE} - Major.Minor.Patch +# ${VERSION_INFO} - Major.minor.Patch[-git_info] +# +# Requires values for: +# ${VERSION_INFO_MAJOR} - Increment on API compatibility changes. +# ${VERSION_INFO_MINOR} - Increment when adding features. +# ${VERSION_INFO_PATCH} - Increment for bug and documentation changes. +# +# Optional: +# ${VERSION_INFO_EXTRA} - Set to "git" to append git info. 
This is +# intended only for non-versioned development +# builds +# ${VERSION_INFO_OVERRIDE} - Set to a non-null value to override the +# VERSION_INFO_EXTRA logic. This is intended +# for automated snapshot builds from exported +# trees, to pass in the git revision info. +# +if(DEFINED __INCLUDED_TOOLAME-DAB_VERSION_CMAKE) + return() +endif() +set(__INCLUDED_TOOLAME-DAB_VERSION_CMAKE TRUE) + +################################################################################ +# Gather up variables provided by parent script +################################################################################ + +if(NOT DEFINED VERSION_INFO_MAJOR) + message(FATAL_ERROR "VERSION_INFO_MAJOR is not defined") +else() + set(VER_MAJ ${VERSION_INFO_MAJOR}) +endif() + +if(NOT DEFINED VERSION_INFO_MINOR) + message(FATAL_ERROR "VERSION_INFO_MINOR is not defined") +else() + set(VER_MIN ${VERSION_INFO_MINOR}) +endif() + +if(NOT DEFINED VERSION_INFO_PATCH) + message(FATAL_ERROR "VERSION_INFO_PATCH is not defined") +else() + set(VER_PAT ${VERSION_INFO_PATCH}) +endif() + + +################################################################################ +# Craft version number, using git, if needed +################################################################################ +find_package(Git QUIET) + +if(GIT_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse -- + ERROR_QUIET + RESULT_VARIABLE NOT_GIT_REPOSITORY + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ) + + if(NOT_GIT_REPOSITORY) + set(GIT_INFO "-unknown") + else() + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD -- + OUTPUT_VARIABLE GIT_REV OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ) + + execute_process( + COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- + RESULT_VARIABLE GIT_DIRTY + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + ) + + if(GIT_DIRTY) + set(GIT_INFO "-${GIT_REV}-dirty") + else() + set(GIT_INFO "-${GIT_REV}") + endif() + endif() + +else() + message(WARNING 
"git missing -- unable to check libfec version.") + unset(NOT_GIT_REPOSITORY) + unset(GIT_REV) + unset(GIT_DIRTY) +endif() + + +################################################################################ +# Provide +################################################################################ +set(VERSION_INFO_BASE "${VER_MAJ}.${VER_MIN}.${VER_PAT}") + +# Force the version suffix. Used for automated export builds. +if(VERSION_INFO_OVERRIDE) + set(VERSION_INFO "${VERSION_INFO_BASE}-${VERSION_INFO_OVERRIDE}") + +# Intra-release builds +elseif("${VERSION_INFO_EXTRA}" STREQUAL "git") + set(VERSION_INFO "${VERSION_INFO_BASE}-git${GIT_INFO}") + +# Versioned releases +elseif("${VERSION_INFO_EXTRA}" STREQUAL "") + set(VERSION_INFO "${VERSION_INFO_BASE}") + +# Invalid +else() + message(FATAL_ERROR + "Unexpected definition of VERSION_INFO_EXTRA: ${VERSION_INFO_EXTRA}") +endif() diff --git a/libfec/cmake/cmake_uninstall.cmake.in b/libfec/cmake/cmake_uninstall.cmake.in new file mode 100644 index 0000000..2037e36 --- /dev/null +++ b/libfec/cmake/cmake_uninstall.cmake.in @@ -0,0 +1,21 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") +endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + exec_program( + "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval + ) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") + endif(NOT "${rm_retval}" STREQUAL 0) + else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + message(STATUS "File 
$ENV{DESTDIR}${file} does not exist.") + endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") +endforeach(file) diff --git a/libfec/config.guess b/libfec/config.guess new file mode 100644 index 0000000..0f0fe71 --- /dev/null +++ b/libfec/config.guess @@ -0,0 +1,1516 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, +# Inc. + +timestamp='2007-03-06' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner <per@bothner.com>. +# Please send patches to <config-patches@gnu.org>. Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type.
+ +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. 
+ +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. 
This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
+ echo "${machine}-${os}${release}" + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. 
+ # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm:riscos:*:*|arm:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. + echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). 
Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + 
SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[45]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? 
) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. 
GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + 
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo 
${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + case ${UNAME_MACHINE} in + pc98) + echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + amd64) + echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + *) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + esac + exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + *:Interix*:[3456]*) + case ${UNAME_MACHINE} in + x86) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + EM64T | authenticamd) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + esac ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? 
+ echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + arm*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit ;; + crisv32:Linux:*:*) + echo crisv32-axis-linux-gnu + exit ;; + frv:Linux:*:*) + echo frv-unknown-linux-gnu + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || 
defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + or32:Linux:*:*) + echo or32-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-gnu + exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit ;; + xtensa:Linux:*:*) + echo xtensa-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it 
will tell us. cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. + echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^LIBC/{ + s: ::g + p + }'`" + test x"${LIBC}" != x && { + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + exit + } + test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. 
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. + echo ${UNAME_MACHINE}-pc-os2-emx + exit ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. 
+ echo i386-pc-msdosdjgpp + exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + 
*:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@ccMail.Census.GOV> + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@openmarket.com>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
+ echo i586-pc-beos + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + SX-8R:SUPER-UX:*:*) + echo sx8r-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NSE-?:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit ;; + *:DragonFly:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case "${UNAME_MACHINE}" in + A*) echo alpha-dec-vms ; exit ;; + I*) echo ia64-dec-vms ; exit ;; + V*) echo vax-dec-vms ; exit ;; + esac ;; + *:XENIX:*:SysV) + echo i386-pc-xenix + exit ;; + i*86:skyos:*:*) + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + exit ;; + i*86:rdos:*:*) + echo ${UNAME_MACHINE}-pc-rdos + exit ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + +cat >&2 < in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/libfec/config.sub b/libfec/config.sub new file mode 100755 index 0000000..a06a480 --- /dev/null +++ b/libfec/config.sub @@ -0,0 +1,1362 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +# Free Software Foundation, Inc. + +timestamp='2001-04-20' + +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. +# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Please send patches to <config-patches@gnu.org>. +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS + $0 [OPTION] ALIAS + +Canonicalize a configuration name. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to <config-patches@gnu.org>." 
+ +version="\ +GNU config.sub ($timestamp) + +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit 0 ;; + --version | -v ) + echo "$version" ; exit 0 ;; + --help | --h* | -h ) + echo "$usage"; exit 0 ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo $1 + exit 0;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | storm-chaos* | os2-emx*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. 
+ ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis) + os= + basic_machine=$1 + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. 
+ tahoe | i860 | ia64 | m32r | m68k | m68000 | m88k | ns32k | arc \ + | arm | arme[lb] | arm[bl]e | armv[2345] | armv[345][lb] | strongarm | xscale \ + | pyramid | mn10200 | mn10300 | tron | a29k \ + | 580 | i960 | h8300 \ + | x86 | ppcbe | mipsbe | mipsle | shbe | shle \ + | hppa | hppa1.0 | hppa1.1 | hppa2.0 | hppa2.0w | hppa2.0n \ + | hppa64 \ + | alpha | alphaev[4-8] | alphaev56 | alphapca5[67] \ + | alphaev6[78] \ + | we32k | ns16k | clipper | i370 | sh | sh[34] \ + | powerpc | powerpcle \ + | 1750a | dsp16xx | pdp10 | pdp11 \ + | mips16 | mips64 | mipsel | mips64el \ + | mips64orion | mips64orionel | mipstx39 | mipstx39el \ + | mips64vr4300 | mips64vr4300el | mips64vr4100 | mips64vr4100el \ + | mips64vr5000 | miprs64vr5000el | mcore | s390 | s390x \ + | sparc | sparclet | sparclite | sparc64 | sparcv9 | sparcv9b \ + | v850 | c4x \ + | thumb | d10v | d30v | fr30 | avr | openrisc | tic80 \ + | pj | pjl | h8500) + basic_machine=$basic_machine-unknown + ;; + m6811 | m68hc11 | m6812 | m68hc12) + # Motorola 68HC11/12. + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | z8k | v70 | w65) + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. + # FIXME: clean up the formatting here. 
+ vax-* | tahoe-* | i*86-* | i860-* | ia64-* | m32r-* | m68k-* | m68000-* \ + | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | c[123]* \ + | arm-* | armbe-* | armle-* | armv*-* | strongarm-* | xscale-* \ + | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \ + | power-* | none-* | 580-* | cray2-* | h8300-* | h8500-* | i960-* \ + | xmp-* | ymp-* \ + | x86-* | ppcbe-* | mipsbe-* | mipsle-* | shbe-* | shle-* \ + | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* | hppa2.0w-* \ + | hppa2.0n-* | hppa64-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphapca5[67]-* \ + | alphaev6[78]-* \ + | we32k-* | cydra-* | ns16k-* | pn-* | np1-* | xps100-* \ + | clipper-* | orion-* \ + | sparclite-* | pdp10-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \ + | sparc64-* | sparcv9-* | sparcv9b-* | sparc86x-* \ + | mips16-* | mips64-* | mipsel-* \ + | mips64el-* | mips64orion-* | mips64orionel-* \ + | mips64vr4100-* | mips64vr4100el-* | mips64vr4300-* | mips64vr4300el-* \ + | mipstx39-* | mipstx39el-* | mcore-* \ + | f30[01]-* | f700-* | s390-* | s390x-* | sv1-* | t3e-* \ + | [cjt]90-* \ + | m88110-* | m680[01234]0-* | m683?2-* | m68360-* | z8k-* | d10v-* \ + | thumb-* | v850-* | d30v-* | tic30-* | tic80-* | c30-* | fr30-* \ + | bs2000-* | tic54x-* | c54x-* | x86_64-* | pj-* | pjl-*) + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | ymp) + basic_machine=ymp-cray + os=-unicos + ;; + cray2) + basic_machine=cray2-cray + os=-unicos + ;; + [cjt]90) + basic_machine=${basic_machine}-cray + os=-unicos + ;; + crds | unos) + basic_machine=m68k-crds + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + 
basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) + basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? 
+ i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mipsel*-linux*) + basic_machine=mipsel-unknown + os=-linux-gnu + ;; + mips*-linux*) + basic_machine=mips-unknown + os=-linux-gnu + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + mmix*) + basic_machine=mmix-knuth + os=-mmixware + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + 
os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pentium | p5 | k5 | k6 | nexgen) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon) + basic_machine=i686-pc + ;; + pentiumii | pentium2) + basic_machine=i686-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + sa29200) + 
basic_machine=a29k-amd + os=-udi + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sparclite-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=t3e-cray + os=-unicos + ;; + tic54x | c54x*) + basic_machine=tic54x-unknown + os=-coff + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xmp) + basic_machine=xmp-cray + os=-unicos + ;; + xps | xps100) + 
basic_machine=xps100-honeywell + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. + w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + mips) + if [ x$os = x-linux-gnu ]; then + basic_machine=mips-unknown + else + basic_machine=mips-mips + fi + ;; + romp) + basic_machine=romp-ibm + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh3 | sh4) + basic_machine=sh-unknown + ;; + sparc | sparcv9 | sparcv9b) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + c4x*) + basic_machine=c4x-none + os=-coff + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. 
+ -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \ + | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \ + | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* | -os2*) + # Remember, each alternative MUST END IN *, to match a version number. 
+ ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto*) + os=-nto-qnx + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. 
Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + m68*-cisco) + os=-aout + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + *-tti) # must be before sparc entry or we get the wrong os. + os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-ibm) + os=-aix + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. 
+vendor=unknown
+case $basic_machine in
+	*-unknown)
+		case $os in
+			-riscix*)
+				vendor=acorn
+				;;
+			-sunos*)
+				vendor=sun
+				;;
+			-aix*)
+				vendor=ibm
+				;;
+			-beos*)
+				vendor=be
+				;;
+			-hpux*)
+				vendor=hp
+				;;
+			-mpeix*)
+				vendor=hp
+				;;
+			-hiux*)
+				vendor=hitachi
+				;;
+			-unos*)
+				vendor=crds
+				;;
+			-dgux*)
+				vendor=dg
+				;;
+			-luna*)
+				vendor=omron
+				;;
+			-genix*)
+				vendor=ns
+				;;
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-ptx*)
+				vendor=sequent
+				;;
+			-vxsim* | -vxworks*)
+				vendor=wrs
+				;;
+			-aux*)
+				vendor=apple
+				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+		esac
+		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+		;;
+esac
+
+echo $basic_machine$os
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/libfec/configure.in b/libfec/configure.in
new file mode 100644
index 0000000..10b5380
--- /dev/null
+++ b/libfec/configure.in
@@ -0,0 +1,90 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT(viterbi27.c)
+AC_CONFIG_HEADER(config.h)
+SO_NAME=3
+VERSION=3.0.0
+AC_SUBST(SO_NAME)
+AC_SUBST(VERSION)
+
+dnl Checks for programs.  GCC is mandatory; quoting with an x prefix keeps the test well-formed when GCC is empty.
+AC_PROG_CC
+if test "x$GCC" != xyes
+then
+	AC_MSG_ERROR([Need GNU C compiler])
+fi
+dnl Checks for libraries.
+AC_CHECK_LIB(c, malloc)
+
+dnl Checks for header files.
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h)
+dnl getopt.h is optional; stdio.h, stdlib.h, memory.h and string.h
+dnl are mandatory.  AC_CHECK_HEADERS records each result (yes or no)
+dnl in the cache variable ac_cv_header_NAME_h, so that is what the
+dnl loop below tests.  The previous tests read "$HAVE_stdio.h",
+dnl which the shell expands as the value of HAVE_stdio followed by
+dnl ".h" and is therefore never empty: a missing header was never
+dnl reported.  The memory.h test also inspected stdlib.h by mistake.
+dnl The error message names the header that is actually missing.
+for fec_hdr in stdio stdlib memory string
+do
+	eval "fec_have=\$ac_cv_header_${fec_hdr}_h"
+	if test "x$fec_have" != xyes
+	then
+		AC_MSG_ERROR([Need ${fec_hdr}.h!])
+	fi
+done
+
+AC_CANONICAL_SYSTEM
+case $target_cpu in
+x86_64)
+	ARCH_OPTION="-msse2"
+	MLIBS="dotprod_port.o \
+	peakval_port.o \
+	sumsq.o sumsq_port.o \
+	cpu_mode_x86_64.o"
+	;;
+i386|i486|i586|i686)
+	ARCH_OPTION="-march=$target_cpu"
+	MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \
+	viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \
+	viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \
+	viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \
+	dotprod_mmx.o dotprod_mmx_assist.o \
+	dotprod_sse2.o dotprod_sse2_assist.o \
+	peakval_mmx.o peakval_mmx_assist.o \
+	peakval_sse.o peakval_sse_assist.o \
+	peakval_sse2.o peakval_sse2_assist.o \
+	sumsq.o sumsq_port.o \
+	sumsq_sse2.o sumsq_sse2_assist.o \
+	sumsq_mmx.o sumsq_mmx_assist.o \
+	cpu_features.o cpu_mode_x86.o"
+	;;
+powerpc*)
+	ARCH_OPTION="-fno-common -faltivec"
+	MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \
+	encode_rs_av.o \
+	dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o"
+	;;
+*)
+	MLIBS="cpu_mode_generic.o"
+esac
+case $target_os in
+darwin*)
+	SH_LIB=libfec.dylib
+	REBIND=""
+	;;
+*)
+	SH_LIB=libfec.so
+	REBIND=ldconfig
+	;;
+esac
+AC_SUBST(SH_LIB)
+AC_SUBST(REBIND)
+AC_SUBST(MLIBS)
+AC_SUBST(ARCH_OPTION)
+
+
+dnl Checks for library functions.
+AC_CHECK_FUNCS(getopt_long memset memmove)
+
+AC_OUTPUT(makefile)
diff --git a/libfec/cpu_features.s b/libfec/cpu_features.s
new file mode 100644
index 0000000..ef4ba4e
--- /dev/null
+++ b/libfec/cpu_features.s
@@ -0,0 +1,15 @@
+.text
+.global cpu_features
+	.type cpu_features,@function
+cpu_features:			# returns the EDX word of CPUID leaf 1 in %eax
+	pushl %ebx		# save the registers that cpuid overwrites
+	pushl %ecx
+	pushl %edx
+	movl $1,%eax		# select CPUID leaf 1
+	cpuid
+	movl %edx,%eax		# return value = EDX
+	popl %edx		# restore in reverse order of the pushes
+	popl %ecx
+	popl %ebx
+	ret
+	
\ No newline at end of file
diff --git a/libfec/cpu_mode_generic.c b/libfec/cpu_mode_generic.c
new file mode 100644
index 0000000..500f995
--- /dev/null
+++ b/libfec/cpu_mode_generic.c
@@ -0,0 +1,13 @@
+/* Fallback for CPUs without a dedicated SIMD detector: always portable C
+ * Copyright 2004 Phil Karn, KA9Q
+ * Copyright 2014 Matthias P. Braendli, HB9EGM
+ */
+#include <stdio.h>
+#include "fec.h"
+
+enum cpu_mode Cpu_mode;
+
+// Use the portable code for this unknown CPU
+void find_cpu_mode(void) {
+  Cpu_mode = PORT;
+}
diff --git a/libfec/cpu_mode_ppc.c b/libfec/cpu_mode_ppc.c
new file mode 100644
index 0000000..0071558
--- /dev/null
+++ b/libfec/cpu_mode_ppc.c
@@ -0,0 +1,40 @@
+/* Determine CPU support for SIMD on Power PC
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+#endif
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+  /* Detect only once: any mode other than UNKNOWN was set by an earlier call */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+#ifdef __VEC__
+  {
+  /* Ask the OS if we have Altivec support */
+  int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+  int hasVectorUnit = 0;
+  size_t length = sizeof(hasVectorUnit);
+  int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+  if(0 == error && hasVectorUnit)
+    Cpu_mode = ALTIVEC;
+  else
+    Cpu_mode = PORT;
+  }
+#else
+  Cpu_mode = PORT;
+#endif
+
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/libfec/cpu_mode_x86.c b/libfec/cpu_mode_x86.c
new file mode 100644
index 0000000..322018e
--- /dev/null
+++ b/libfec/cpu_mode_x86.c
@@ -0,0 +1,33 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+
+  int f;
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* cpu_features() returns the CPUID leaf 1 EDX word; test the best SIMD level first */
+  f = cpu_features();
+  if(f & (1<<26)){ /* SSE2 is present */
+    Cpu_mode = SSE2;
+  } else if(f & (1<<25)){ /* SSE is present */
+    Cpu_mode = SSE;
+  } else if(f & (1<<23)){ /* MMX is present */
+    Cpu_mode = MMX;
+  } else { /* No SIMD at all */
+    Cpu_mode = PORT;
+  }
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/libfec/cpu_mode_x86_64.c b/libfec/cpu_mode_x86_64.c
new file mode 100644
index 0000000..758096a
--- /dev/null
+++ b/libfec/cpu_mode_x86_64.c
@@ -0,0 +1,27 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ *
+ * Modified in 2012 by Matthias P. Braendli, HB9EGM
+ */
+#include <stdio.h>
+#include "fec.h"
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+
+  /* Detect only once: any mode other than UNKNOWN was set by an earlier call */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* According to the wikipedia entry x86-64, all x86-64 processors have SSE2 */
+  /* The same assumption is also in other source files !
*/ + Cpu_mode = SSE2; + fprintf(stderr,"CPU: x86-64, using portable C implementation\n"); +} diff --git a/libfec/decode_rs.c b/libfec/decode_rs.c new file mode 100644 index 0000000..d7f97b3 --- /dev/null +++ b/libfec/decode_rs.c @@ -0,0 +1,262 @@ +/* Reed-Solomon decoder + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include +#endif + +#include + +#define NULL ((void *)0) +#define min(a,b) ((a) < (b) ? (a) : (b)) + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +int DECODE_RS( +#ifdef FIXED +data_t *data, int *eras_pos, int no_eras,int pad){ +#else +void *p,data_t *data, int *eras_pos, int no_eras){ + struct rs *rs = (struct rs *)p; +#endif + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return -1; +#endif + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. 
*/ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). + */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. 
num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i) must be included by the calling + * program. + */ + + +#if !defined(NROOTS) +#error "NROOTS not defined" +#endif + +#if !defined(NN) +#error "NN not defined" +#endif + +#if !defined(PAD) +#error "PAD not defined" +#endif + +#if !defined(ALPHA_TO) +#error "ALPHA_TO not defined" +#endif + +#if !defined(INDEX_OF) +#error "INDEX_OF not defined" +#endif + +#if !defined(MODNN) +#error "MODNN not defined" +#endif + +#if !defined(FCR) +#error "FCR not defined" +#endif + +#if !defined(PRIM) +#error "PRIM not defined" +#endif + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#undef MIN +#define MIN(a,b) ((a) < (b) ? 
(a) : (b)) +#undef A0 +#define A0 (NN) + +{ + int deg_lambda, el, deg_omega; + int i, j, r,k; + data_t u,q,tmp,num1,num2,den,discr_r; + data_t lambda[NROOTS+1], s[NROOTS]; /* Err+Eras Locator poly + * and syndrome poly */ + data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1]; + data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS]; + int syn_error, count; + + /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */ + for(i=0;i 0) { + /* Init lambda to be the erasure locator polynomial */ + lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))]; + for (i = 1; i < no_eras; i++) { + u = MODNN(PRIM*(NN-1-eras_pos[i])); + for (j = i+1; j > 0; j--) { + tmp = INDEX_OF[lambda[j - 1]]; + if(tmp != A0) + lambda[j] ^= ALPHA_TO[MODNN(u + tmp)]; + } + } + +#if DEBUG >= 1 + /* Test code that verifies the erasure locator polynomial just constructed + Needed only for decoder debugging. */ + + /* find roots of the erasure location polynomial */ + for(i=1;i<=no_eras;i++) + reg[i] = INDEX_OF[lambda[i]]; + + count = 0; + for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) { + q = 1; + for (j = 1; j <= no_eras; j++) + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + if (q != 0) + continue; + /* store root and error location number indices */ + root[count] = i; + loc[count] = k; + count++; + } + if (count != no_eras) { + printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras); + count = -1; + goto finish; + } +#if DEBUG >= 2 + printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n"); + for (i = 0; i < count; i++) + printf("%d ", loc[i]); + printf("\n"); +#endif +#endif + } + for(i=0;i 0; j--){ + if (reg[j] != A0) { + reg[j] = MODNN(reg[j] + j); + q ^= ALPHA_TO[reg[j]]; + } + } + if (q != 0) + continue; /* Not a root */ + /* store root (index-form) and error location number */ +#if DEBUG>=2 + printf("count %d root %d loc %d\n",count,i,k); +#endif + root[count] = i; + loc[count] = k; + /* If we've already found max possible 
roots, + * abort the search to save time + */ + if(++count == deg_lambda) + break; + } + if (deg_lambda != count) { + /* + * deg(lambda) unequal to number of roots => uncorrectable + * error detected + */ + count = -1; + goto finish; + } + /* + * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo + * x**NROOTS). in index form. Also find deg(omega). + */ + deg_omega = deg_lambda-1; + for (i = 0; i <= deg_omega;i++){ + tmp = 0; + for(j=i;j >= 0; j--){ + if ((s[i - j] != A0) && (lambda[j] != A0)) + tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])]; + } + omega[i] = INDEX_OF[tmp]; + } + + /* + * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 = + * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form + */ + for (j = count-1; j >=0; j--) { + num1 = 0; + for (i = deg_omega; i >= 0; i--) { + if (omega[i] != A0) + num1 ^= ALPHA_TO[MODNN(omega[i] + i * root[j])]; + } + num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)]; + den = 0; + + /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */ + for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) { + if(lambda[i+1] != A0) + den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])]; + } +#if DEBUG >= 1 + if (den == 0) { + printf("\n ERROR: denominator = 0\n"); + count = -1; + goto finish; + } +#endif + /* Apply error to data */ + if (num1 != 0 && loc[j] >= PAD) { + data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])]; + } + } + finish: + if(eras_pos != NULL){ + for(i=0;i +#endif + +#include + +#include "fixed.h" + +int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){ + int retval; + + if(pad < 0 || pad > 222){ + return -1; + } + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/decode_rs_ccsds.c b/libfec/decode_rs_ccsds.c new file mode 100644 index 0000000..0e246b4 --- /dev/null +++ b/libfec/decode_rs_ccsds.c @@ -0,0 +1,26 @@ +/* This function wraps around the fixed 8-bit decoder, performing the + * basis 
transformations necessary to meet the CCSDS standard + * + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include "ccsds.h" +#include "fec.h" + +int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){ + int i,r; + data_t cdata[NN]; + + /* Convert data from dual basis to conventional */ + for(i=0;i 0){ + /* Convert from conventional to dual basis */ + for(i=0;i +#endif + +#include + +#include "char.h" +#include "rs-common.h" + +int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/decode_rs_int.c b/libfec/decode_rs_int.c new file mode 100644 index 0000000..1ef1a1f --- /dev/null +++ b/libfec/decode_rs_int.c @@ -0,0 +1,22 @@ +/* General purpose Reed-Solomon decoder + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifdef DEBUG +#include +#endif + +#include + +#include "int.h" +#include "rs-common.h" + +int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){ + int retval; + struct rs *rs = (struct rs *)p; + +#include "decode_rs.h" + + return retval; +} diff --git a/libfec/dotprod.c b/libfec/dotprod.c new file mode 100644 index 0000000..5fb1da9 --- /dev/null +++ b/libfec/dotprod.c @@ -0,0 +1,111 @@ +/* 16-bit signed integer dot product + * Switch to appropriate versions + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +void *initdp_port(signed short coeffs[],int len); +long dotprod_port(void *p,signed short *b); +void freedp_port(void *p); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void *initdp_sse2(signed short coeffs[],int len); +long dotprod_mmx(void *p,signed short *b); +long dotprod_sse2(void *p,signed short *b); +void freedp_mmx(void *p); 
+void freedp_sse2(void *p); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +long dotprod_av(void *p,signed short *b); +void freedp_av(void *p); +#endif + +/* Create and return a descriptor for use with the dot product function */ +void *initdp(signed short coeffs[],int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return initdp_port(coeffs,len); +#ifdef __i386__ + case MMX: + case SSE: + return initdp_mmx(coeffs,len); + case SSE2: + return initdp_sse2(coeffs,len); +#endif + +#ifdef __x86_64__ + case SSE2: + return initdp_port(coeffs,len); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return initdp_av(coeffs,len); +#endif + } +} + + +/* Free a dot product descriptor created earlier */ +void freedp(void *p){ + switch(Cpu_mode){ + case PORT: + default: + return freedp_port(p); +#ifdef __i386__ + case MMX: + case SSE: + return freedp_mmx(p); + case SSE2: + return freedp_sse2(p); +#endif + +#ifdef __x86_64__ + case SSE2: + return freedp_port(p); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return freedp_av(p); +#endif + } +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod(void *p,signed short a[]){ + switch(Cpu_mode){ + case PORT: + default: + return dotprod_port(p,a); +#ifdef __i386__ + case MMX: + case SSE: + return dotprod_mmx(p,a); + case SSE2: + return dotprod_sse2(p,a); +#endif + +#ifdef __x86_64__ + case SSE2: + return dotprod_port(p,a); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return dotprod_av(p,a); +#endif + } +} + + diff --git a/libfec/dotprod.h b/libfec/dotprod.h new file mode 100644 index 0000000..6b62b70 --- /dev/null +++ b/libfec/dotprod.h @@ -0,0 +1,15 @@ +/* Internal definitions for dotproduct function */ + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to meet all possible input data + * alignments (see 
Intel ap559 on MMX dot products). + * + * SSE2 is similar, but with 8 words at a time + * + * On a non-MMX machine, only one copy is present + */ + signed short *coeffs[8]; +}; diff --git a/libfec/dotprod_av.c b/libfec/dotprod_av.c new file mode 100644 index 0000000..1f70471 --- /dev/null +++ b/libfec/dotprod_av.c @@ -0,0 +1,93 @@ +/* 16-bit signed integer dot product + * Altivec-assisted version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On an Altivec machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + */ + signed short *coeffs[8]; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_av(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short)); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_av(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_av(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + vector signed short *ar,*d; + vector signed int sums0,sums1,sums2,sums3; + union { vector signed int v; signed int w[4];} s; + int nblocks; + + /* round ar down to beginning of 16-byte block containing 0th element of + * input buffer. 
Then set d to one of 8 sets of shifted coefficients + */ + ar = (vector signed short *)((int)a & ~15); + al = ((int)a & 15)/sizeof(signed short); + d = (vector signed short *)dp->coeffs[al]; + + nblocks = (dp->len+al-1)/8+1; + + /* Sum into four vectors each holding four 32-bit partial sums */ + sums3 = sums2 = sums1 = sums0 = (vector signed int)(0); + while(nblocks >= 4){ + sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0); + sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1); + sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2); + sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3); + nblocks -= 4; + } + sums0 = vec_adds(sums0,sums1); + sums2 = vec_adds(sums2,sums3); + sums0 = vec_adds(sums0,sums2); + while(nblocks-- > 0){ + sums0 = vec_msums(ar[nblocks],d[nblocks],sums0); + } + /* Sum 4 partial sums into final result */ + s.v = vec_sums(sums0,(vector signed int)(0)); + + return s.w[3]; +} + + diff --git a/libfec/dotprod_mmx.c b/libfec/dotprod_mmx.c new file mode 100644 index 0000000..c516afe --- /dev/null +++ b/libfec/dotprod_mmx.c @@ -0,0 +1,81 @@ +/* 16-bit signed integer dot product + * MMX assisted version; also for SSE + * + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a MMX or SSE machine, these hold 4 copies of the coefficients, + * preshifted by 0,1,2,3 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). 
+ */ + signed short *coeffs[4]; +}; +long dotprod_mmx_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_mmx(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j; + + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 4 copies of coefficients, one for each data alignment */ + for(i=0;i<4;i++){ + dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4, + 4*sizeof(signed short)); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_mmx(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<4;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_mmx(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + /* Round input data address down to 8 byte boundary + * NB: depending on the alignment of a[], memory + * before a[] will be accessed. The contents don't matter since they'll + * be multiplied by zero coefficients. I can't conceive of any + * situation where this could cause a segfault since memory protection + * in the x86 machines is done on much larger boundaries + */ + ar = (signed short *)((int)a & ~7); + + /* Choose one of 4 sets of pre-shifted coefficients. 
al is both the + * index into dp->coeffs[] and the number of 0 words padded onto + * that coefficients array for alignment purposes + */ + al = a - ar; + + /* Call assembler routine to do the work, passing number of 4-word blocks */ + return dotprod_mmx_assist(ar,dp->coeffs[al],(dp->len+al-1)/4+1); +} + diff --git a/libfec/dotprod_mmx_assist.s b/libfec/dotprod_mmx_assist.s new file mode 100644 index 0000000..25deffd --- /dev/null +++ b/libfec/dotprod_mmx_assist.s @@ -0,0 +1,83 @@ +# SIMD MMX dot product +# Equivalent to the following C code: +# long dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 4; +# while(cnt--) +# sum += *a++ + *b++; +# return sum; +# } +# a and b should also be 64-bit aligned, or speed will suffer greatly +# Copyright 1999, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_mmx_assist + .type dotprod_mmx_assist,@function +dotprod_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %mm0,%mm0 # clear running sum (in two 32-bit halves) + +# MMX dot product loop unrolled 4 times, crunching 16 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movq (%esi),%mm1 # mm1 = a[3],a[2],a[1],a[0] + pmaddwd (%edi),%mm1 # mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0] + paddd %mm1,%mm0 + + movq 8(%esi),%mm1 + pmaddwd 8(%edi),%mm1 + paddd %mm1,%mm0 + + movq 16(%esi),%mm1 + pmaddwd 16(%edi),%mm1 + paddd %mm1,%mm0 + + movq 24(%esi),%mm1 + addl $32,%esi + pmaddwd 24(%edi),%mm1 + addl $32,%edi + paddd %mm1,%mm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx + +# MMX dot product loop, not unrolled, crunching 4 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movq (%esi),%mm1 + addl $8,%esi + pmaddwd (%edi),%mm1 + addl $8,%edi + paddd %mm1,%mm0 + jmp 
.Loop2 +.Loop2Done: + + movd %mm0,%ebx # right-hand word to ebx + punpckhdq %mm0,%mm0 # left-hand word to right side of %mm0 + movd %mm0,%eax + addl %ebx,%eax # running sum now in %eax + emms # done with MMX + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret diff --git a/libfec/dotprod_port.c b/libfec/dotprod_port.c new file mode 100644 index 0000000..ef635ec --- /dev/null +++ b/libfec/dotprod_port.c @@ -0,0 +1,58 @@ +/* 16-bit signed integer dot product + * Portable C version + * Copyright 2004 Phil Karn + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + signed short *coeffs; +}; + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_port(signed short coeffs[],int len){ + struct dotprod *dp; + int j; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Just one copy of the coefficients for the C version */ + dp->coeffs = (signed short *)calloc(len,sizeof(signed short)); + for(j=0;jcoeffs[j] = coeffs[j]; + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_port(void *p){ + struct dotprod *dp = (struct dotprod *)p; + + if(dp->coeffs != NULL) + free(dp->coeffs); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_port(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + long corr; + int i; + + corr = 0; + for(i=0;ilen;i++){ + corr += (long)a[i] * dp->coeffs[i]; + } + return corr; +} + + diff --git a/libfec/dotprod_sse2.c b/libfec/dotprod_sse2.c new file mode 100644 index 0000000..1fddd18 --- /dev/null +++ b/libfec/dotprod_sse2.c @@ -0,0 +1,72 @@ +/* 16-bit signed integer dot product + * SSE2 version + * Copyright 2004 Phil Karn + * May be used under the terms 
of the GNU Lesser General Public License (LGPL) + */ +#define _XOPEN_SOURCE 600 +#include +#include +#include "fec.h" + +struct dotprod { + int len; /* Number of coefficients */ + + /* On a SSE2 machine, these hold 8 copies of the coefficients, + * preshifted by 0,1,..7 words to meet all possible input data + * alignments (see Intel ap559 on MMX dot products). + */ + signed short *coeffs[8]; +}; + +long dotprod_sse2_assist(signed short *a,signed short *b,int cnt); + +/* Create and return a descriptor for use with the dot product function */ +void *initdp_sse2(signed short coeffs[],int len){ + struct dotprod *dp; + int i,j,blksize; + + if(len == 0) + return NULL; + + dp = (struct dotprod *)calloc(1,sizeof(struct dotprod)); + dp->len = len; + + /* Make 8 copies of coefficients, one for each data alignment, + * each aligned to 16-byte boundary + */ + for(i=0;i<8;i++){ + blksize = (1+(len+i-1)/8) * 8*sizeof(signed short); + posix_memalign((void **)&dp->coeffs[i],16,blksize); + memset(dp->coeffs[i],0,blksize); + for(j=0;jcoeffs[i][j+i] = coeffs[j]; + } + return (void *)dp; +} + + +/* Free a dot product descriptor created earlier */ +void freedp_sse2(void *p){ + struct dotprod *dp = (struct dotprod *)p; + int i; + + for(i=0;i<8;i++) + if(dp->coeffs[i] != NULL) + free(dp->coeffs[i]); + free(dp); +} + +/* Compute a dot product given a descriptor and an input array + * The length is taken from the descriptor + */ +long dotprod_sse2(void *p,signed short a[]){ + struct dotprod *dp = (struct dotprod *)p; + int al; + signed short *ar; + + ar = (signed short *)((int)a & ~15); + al = a - ar; + + /* Call assembler routine to do the work, passing number of 8-word blocks */ + return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1); +} diff --git a/libfec/dotprod_sse2_assist.s b/libfec/dotprod_sse2_assist.s new file mode 100644 index 0000000..47348fa --- /dev/null +++ b/libfec/dotprod_sse2_assist.s @@ -0,0 +1,85 @@ +# SIMD SSE2 dot product +# Equivalent to the following C 
code: +# long dotprod(signed short *a,signed short *b,int cnt) +# { +# long sum = 0; +# cnt *= 8; +# while(cnt--) +# sum += *a++ * *b++; +# return sum; +# } +# a and b must be 128-bit aligned +# Copyright 2001, Phil Karn KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + .global dotprod_sse2_assist + .type dotprod_sse2_assist,@function +dotprod_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ecx + pushl %ebx + movl 8(%ebp),%esi # a + movl 12(%ebp),%edi # b + movl 16(%ebp),%ecx # cnt + pxor %xmm0,%xmm0 # clear running sum (in four 32-bit lanes) + +# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop + .align 16 +.Loop1: subl $4,%ecx + jl .Loop1Done + + movdqa (%esi),%xmm1 + pmaddwd (%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 16(%esi),%xmm1 + pmaddwd 16(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 32(%esi),%xmm1 + pmaddwd 32(%edi),%xmm1 + paddd %xmm1,%xmm0 + + movdqa 48(%esi),%xmm1 + addl $64,%esi + pmaddwd 48(%edi),%xmm1 + addl $64,%edi + paddd %xmm1,%xmm0 + + jmp .Loop1 +.Loop1Done: + + addl $4,%ecx + +# SSE2 dot product loop, not unrolled, crunching 8 terms per loop +# This could be redone as Duff's Device on the unrolled loop above +.Loop2: subl $1,%ecx + jl .Loop2Done + + movdqa (%esi),%xmm1 + addl $16,%esi + pmaddwd (%edi),%xmm1 + addl $16,%edi + paddd %xmm1,%xmm0 + jmp .Loop2 +.Loop2Done: + + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 + paddd %xmm1,%xmm0 + movd %xmm0,%eax # right-hand word to eax + psrldq $4,%xmm0 + movd %xmm0,%ebx + addl %ebx,%eax + + popl %ebx + popl %ecx + popl %edi + popl %esi + movl %ebp,%esp + popl %ebp + ret diff --git a/libfec/dsp.3 b/libfec/dsp.3 new file mode 100644 index 0000000..e9794da --- /dev/null +++ b/libfec/dsp.3 @@ -0,0 +1,63 @@ +.TH DSP 3 +.SH NAME +initdp, freedp, dotprod, sumsq, peakval -\ SIMD-assisted +digital signal processing primitives +.SH SYNOPSIS +.nf +.ft +#include "fec.h" + +void *initdp(signed short *coeffs,int len); +long 
dotprod(void *p,signed short *a); +void freedp(void *p); + +unsigned long long sumsq(signed short *in,int cnt); + +int peakval(signed short *b,int cnt); + +.SH DESCRIPTION +These functions provide several basic primitives useful in digital +signal processing (DSP), especially in modems. The \fBinitdp\fR, +\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot +product useful in correlation and filtering operations on signed +16-bit integers. \fBsumsq\fR computes the sum +of the squares of an array of signed 16-bit integers, +useful for measuring the energy of a signal. \fBpeakval\fR returns the +absolute value of the largest magnitude element in the input array, +useful for scaling a signal's amplitude. + +Each function uses IA32 or PowerPC Altivec instructions when +available; otherwise, a portable C version is used. + +.SH USAGE +To create a FIR filter or correlator, call \fBinitdp\fR with the +coefficients in \fBcoeffs\fR and their number in \fBlen\fR. This +creates the appropriate data structures and returns a handle. + +To compute a dot product, pass the handle from \fBinitdp\fR and the +input array to \fBdotprod\fR. No length field is needed as the number +of samples will be taken from the \fBlen\fR parameter originally given +to \fBinitdp\fR. There must be at least as many samples in the input +array as there were coefficients passed to \fBinitdp\fR. + +When the filter or correlator is no longer needed, the data structures +may be freed by passing the handle to \fBfreedp\fR. + +The user is responsible for scaling the inputs to \fBinitdp\fR and +\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently +wrap around in the event of overflow. + +To compute the sum of the squares of an array of signed 16-bit +integers, use \fBsumsq\fR. This returns a 64-bit sum. + +\fBpeakval\fR computes the absolute value of each 16-bit element in +the input array and returns the largest. 
+ +.SH RETURN VALUES + +\fBinitdp\fR returns a handle that points to a control block, or NULL in +the event of an error (such as a memory allocation failure). \fBsumsq\fR +and \fBpeakval\fR have no error returns. + +.SH AUTHOR and COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) diff --git a/libfec/dtest.c b/libfec/dtest.c new file mode 100644 index 0000000..394cb03 --- /dev/null +++ b/libfec/dtest.c @@ -0,0 +1,99 @@ +/* Test dot-product function */ + +#include +#include +#include +#include +#include "config.h" +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {"trials",0,NULL,'n'}, + {NULL}, +}; +#endif + +int main(int argc,char *argv[]){ + short coeffs[512]; + short input[2048]; + int trials=1000,d; + int errors = 0; + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"apmstn:")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'n': + trials = atoi(optarg); + break; + } + } + + while(trials--){ + long port_result; + long simd_result; + int ntaps; + int i; + int csum = 0; + int offset; + void *dp_simd,*dp_port; + + /* Generate set of coefficients + * limit sum of absolute values to 32767 to avoid overflow + */ + memset(coeffs,0,sizeof(coeffs)); + for(i=0;i<512;i++){ + double gv; + + gv = normal_rand(0.,100.); + if(csum + fabs(gv) > 32767) + break; + coeffs[i] = gv; + csum += fabs(gv); + } + ntaps = i; + + /* Compare results to portable C version for a bunch of random data buffers and offsets */ + dp_simd = initdp(coeffs,ntaps); + dp_port = initdp_port(coeffs,ntaps); + + for(i=0;i<2048;i++) + input[i] = 
random(); + + offset = random() & 511; + + simd_result = dotprod(dp_simd,input+offset); + port_result = dotprod_port(dp_port,input+offset); + if(simd_result != port_result){ + errors++; + } + } + printf("dtest: %d errors\n",errors); + exit(0); +} diff --git a/libfec/encode_rs.c b/libfec/encode_rs.c new file mode 100644 index 0000000..0649094 --- /dev/null +++ b/libfec/encode_rs.c @@ -0,0 +1,52 @@ +/* Reed-Solomon encoder + * Copyright 2002, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#ifdef FIXED +#include "fixed.h" +#elif defined(BIGSYM) +#include "int.h" +#else +#include "char.h" +#endif + +void ENCODE_RS( +#ifdef FIXED +data_t *data, data_t *bb,int pad){ +#else +void *p,data_t *data, data_t *bb){ + struct rs *rs = (struct rs *)p; +#endif + int i, j; + data_t feedback; + +#ifdef FIXED + /* Check pad parameter for validity */ + if(pad < 0 || pad >= NN) + return; +#endif + + memset(bb,0,NROOTS*sizeof(data_t)); + + for(i=0;i) must be included by the calling + * program. 
+ + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + + +#undef A0 +#define A0 (NN) /* Special reserved value encoding zero in index form */ + +{ + int i, j; + data_t feedback; + + memset(parity,0,NROOTS*sizeof(data_t)); + + for(i=0;i +#include "fixed.h" +#ifdef __VEC__ +#include +#endif + + +static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode; + +static void encode_rs_8_c(data_t *data, data_t *parity,int pad); +#if __vec__ +static void encode_rs_8_av(data_t *data, data_t *parity,int pad); +#endif +#if __i386__ +int cpu_features(void); +#endif + +void encode_rs_8(data_t *data, data_t *parity,int pad){ + if(cpu_mode == UNKNOWN){ +#ifdef __i386__ + int f; + /* Figure out what kind of CPU we have */ + f = cpu_features(); + if(f & (1<<26)){ /* SSE2 is present */ + cpu_mode = SSE2; + } else if(f & (1<<25)){ /* SSE is present */ + cpu_mode = SSE; + } else if(f & (1<<23)){ /* MMX is present */ + cpu_mode = MMX; + } else { /* No SIMD at all */ + cpu_mode = PORT; + } +#elif __x86_64__ + cpu_mode = SSE2; +#elif __VEC__ + /* Ask the OS if we have Altivec support */ + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if(0 == error && hasVectorUnit) + cpu_mode = ALTIVEC; + else + cpu_mode = PORT; +#else + cpu_mode = PORT; +#endif + } + switch(cpu_mode){ +#if __vec__ + case ALTIVEC: + encode_rs_8_av(data,parity,pad); + return; +#endif + +#if __i386__ + case MMX: + case SSE: + case SSE2: +#endif + +#ifdef __x86_64__ + case SSE2: +#endif + + default: + encode_rs_8_c(data,parity,pad); + return; + } +} + +#if __vec__ /* PowerPC G4/G5 Altivec instructions are available */ + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned 
char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. Since the generator polynomial is + * palindromic, we form the other half by reversing this one + */ +extern static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static void encode_rs_8_av(data_t *data, data_t *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i +#include +#include "fixed.h" + +/* Lookup table for feedback multiplications + * These are the low half of the coefficients. Since the generator polynomial is + * palindromic, we form it by reversing these on the fly + */ +static union { vector unsigned char v; unsigned char c[16]; } table[256]; + +static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1); +static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +void rs_init_av(){ + int i,j; + + /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */ + for(j=0;j<16;j++){ + table[0].c[j] = 0; + for(i=1;i<256;i++){ + table[i].c[16-j-1] = CCSDS_alpha_to[MODNN(CCSDS_poly[j+1] + CCSDS_index_of[i])]; + } + } +#if 0 + for(i=0;i<256;i++){ + printf("table[%3d] = %3vu\n",i,table[i].v); + } +#endif +} + +void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ + union { vector unsigned char v[2]; unsigned char c[32]; } shift_register; + int i; + + shift_register.v[0] = (vector unsigned char)(0); + shift_register.v[1] = (vector unsigned char)(0); + + for(i=0;i + +#include "char.h" +#include "rs-common.h" + +void encode_rs_char(void *p,data_t *data, data_t 
*parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/libfec/encode_rs_int.c b/libfec/encode_rs_int.c new file mode 100644 index 0000000..3c9ce78 --- /dev/null +++ b/libfec/encode_rs_int.c @@ -0,0 +1,15 @@ +/* Reed-Solomon encoder + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "int.h" +#include "rs-common.h" + +void encode_rs_int(void *p,data_t *data, data_t *parity){ + struct rs *rs = (struct rs *)p; + +#include "encode_rs.h" + +} diff --git a/libfec/exercise.c b/libfec/exercise.c new file mode 100644 index 0000000..8ae008c --- /dev/null +++ b/libfec/exercise.c @@ -0,0 +1,122 @@ +/* Exercise an RS codec a specified number of times using random + * data and error patterns + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */ + +#include +#include +#include + +#ifdef FIXED +#include "fixed.h" +#define EXERCISE exercise_8 +#elif defined(CCSDS) +#include "fixed.h" +#include "ccsds.h" +#define EXERCISE exercise_ccsds +#elif defined(BIGSYM) +#include "int.h" +#define EXERCISE exercise_int +#else +#include "char.h" +#define EXERCISE exercise_char +#endif + +#ifdef FIXED +#define PRINTPARM printf("(255,223):"); +#elif defined(CCSDS) +#define PRINTPARM printf("CCSDS (255,223):"); +#else +#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots); +#endif + +/* Exercise the RS codec passed as an argument */ +int EXERCISE( +#if !defined(CCSDS) && !defined(FIXED) +void *p, +#endif +int trials){ +#if !defined(CCSDS) && !defined(FIXED) + struct rs *rs = (struct rs *)p; +#endif + data_t block[NN],tblock[NN]; + int i; + int errors; + int errlocs[NN]; + int derrlocs[NROOTS]; + int derrors; + int errval,errloc; + int erasures; + int decoder_errors = 0; + + while(trials-- != 0){ + /* Test up to the error correction 
capacity of the code */ + for(errors=0;errors <= NROOTS/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i +#include "fec.h" + +unsigned char Partab[256]; +int P_init; + +/* Create 256-entry odd-parity lookup table + * Needed only on non-ia32 machines + */ +void partab_init(void){ + int i,cnt,ti; + + /* Initialize parity lookup table */ + for(i=0;i<256;i++){ + cnt = 0; + ti = i; + while(ti){ + if(ti & 1) + cnt++; + ti >>= 1; + } + Partab[i] = cnt & 1; + } + P_init=1; +} + +/* Lookup table giving count of 1 bits for integers 0-255 */ +int Bitcnt[] = { + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8, +}; + diff --git a/libfec/fec.h b/libfec/fec.h new file mode 100644 index 0000000..d6d4b08 --- /dev/null +++ b/libfec/fec.h @@ -0,0 +1,355 @@ +/* User include file for libfec + * Copyright 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#ifndef _FEC_H_ +#define _FEC_H_ + +/* r=1/2 k=7 convolutional encoder polynomials + * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB + * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted + */ +#define V27POLYA 0x6d +#define V27POLYB 0x4f + 
+void *create_viterbi27(int len); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); + +#ifdef __VEC__ +void *create_viterbi27_av(int len); +void set_viterbi27_polynomial_av(int polys[2]); +int init_viterbi27_av(void *p,int starting_state); +int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_av(void *p); +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi27_mmx(int len); +void set_viterbi27_polynomial_mmx(int polys[2]); +int init_viterbi27_mmx(void *p,int starting_state); +int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_mmx(void *p); +int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse(int len); +void set_viterbi27_polynomial_sse(int polys[2]); +int init_viterbi27_sse(void *p,int starting_state); +int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse(void *p); +int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi27_sse2(int len); +void set_viterbi27_polynomial_sse2(int polys[2]); +int init_viterbi27_sse2(void *p,int starting_state); +int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27_sse2(void *p); +int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi27_port(int len); +void set_viterbi27_polynomial_port(int polys[2]); +int init_viterbi27_port(void *p,int starting_state); +int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int 
nbits,unsigned int endstate); +void delete_viterbi27_port(void *p); +int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/2 k=9 convolutional encoder polynomials */ +#define V29POLYA 0x1af +#define V29POLYB 0x11d + +void *create_viterbi29(int len); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); + +#ifdef __VEC__ +void *create_viterbi29_av(int len); +void set_viterbi29_polynomial_av(int polys[2]); +int init_viterbi29_av(void *p,int starting_state); +int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_av(void *p); +int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi29_mmx(int len); +void set_viterbi29_polynomial_mmx(int polys[2]); +int init_viterbi29_mmx(void *p,int starting_state); +int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_mmx(void *p); +int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse(int len); +void set_viterbi29_polynomial_sse(int polys[2]); +int init_viterbi29_sse(void *p,int starting_state); +int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_sse(void *p); +int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi29_sse2(int len); +void set_viterbi29_polynomial_sse2(int polys[2]); +int init_viterbi29_sse2(void *p,int starting_state); +int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_sse2(void *p); +int update_viterbi29_blk_sse2(void *p,unsigned char 
*syms,int nbits); +#endif + +void *create_viterbi29_port(int len); +void set_viterbi29_polynomial_port(int polys[2]); +int init_viterbi29_port(void *p,int starting_state); +int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29_port(void *p); +int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits); + +/* r=1/3 k=9 convolutional encoder polynomials */ +#define V39POLYA 0x1ed +#define V39POLYB 0x19b +#define V39POLYC 0x127 + +void *create_viterbi39(int len); +void set_viterbi39_polynomial(int polys[3]); +int init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); + +#ifdef __VEC__ +void *create_viterbi39_av(int len); +void set_viterbi39_polynomial_av(int polys[3]); +int init_viterbi39_av(void *p,int starting_state); +int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_av(void *p); +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi39_mmx(int len); +void set_viterbi39_polynomial_mmx(int polys[3]); +int init_viterbi39_mmx(void *p,int starting_state); +int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_mmx(void *p); +int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse(int len); +void set_viterbi39_polynomial_sse(int polys[3]); +int init_viterbi39_sse(void *p,int starting_state); +int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse(void *p); +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi39_sse2(int len); +void 
set_viterbi39_polynomial_sse2(int polys[3]); +int init_viterbi39_sse2(void *p,int starting_state); +int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_sse2(void *p); +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void *create_viterbi39_port(int len); +void set_viterbi39_polynomial_port(int polys[3]); +int init_viterbi39_port(void *p,int starting_state); +int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39_port(void *p); +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits); + + +/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion + * dfree = 56 + * These bits may be left-right flipped from some textbook representations; + * here I have the bits entering the shift register from the right (low) end + * + * Some other spacecraft use the same code, but with the polynomials in a different order. + * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion, + * so use set_viterbi615_polynomial() as appropriate. 
+ */ +#define V615POLYA 042631 +#define V615POLYB 047245 +#define V615POLYC 056507 +#define V615POLYD 073363 +#define V615POLYE 077267 +#define V615POLYF 064537 + +void *create_viterbi615(int len); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits); +int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); + +#ifdef __VEC__ +void *create_viterbi615_av(int len); +void set_viterbi615_polynomial_av(int polys[6]); +int init_viterbi615_av(void *p,int starting_state); +int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_av(void *p); +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits); +#endif + +#ifdef __i386__ +void *create_viterbi615_mmx(int len); +void set_viterbi615_polynomial_mmx(int polys[6]); +int init_viterbi615_mmx(void *p,int starting_state); +int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_mmx(void *p); +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse(int len); +void set_viterbi615_polynomial_sse(int polys[6]); +int init_viterbi615_sse(void *p,int starting_state); +int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse(void *p); +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits); + +void *create_viterbi615_sse2(int len); +void set_viterbi615_polynomial_sse2(int polys[6]); +int init_viterbi615_sse2(void *p,int starting_state); +int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_sse2(void *p); +int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits); +#endif + +void 
*create_viterbi615_port(int len); +void set_viterbi615_polynomial_port(int polys[6]); +int init_viterbi615_port(void *p,int starting_state); +int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615_port(void *p); +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits); + + +/* General purpose RS codec, 8-bit symbols */ +void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity); +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); +void *init_rs_char(int symsize,int gfpoly, + int fcr,int prim,int nroots, + int pad); +void free_rs_char(void *rs); + +/* General purpose RS codec, integer symbols */ +void encode_rs_int(void *rs,int *data,int *parity); +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); +void *init_rs_int(int symsize,int gfpoly,int fcr, + int prim,int nroots,int pad); +void free_rs_int(void *rs); + +/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis) + * symbol representation + */ +void encode_rs_8(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */ +void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad); +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad); + +/* Tables to map from conventional->dual (Taltab) and + * dual->conventional (Tal1tab) bases + */ +extern unsigned char Taltab[],Tal1tab[]; + + +/* CPU SIMD instruction set available */ +extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode; +void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */ + +/* Determine parity of argument: 1 = odd, 0 = even */ +#if defined(__i386__) || defined(__x86_64__) +static inline int parityb(unsigned char x){ + __asm__ __volatile__ ("test %1,%1;setpo %0" : "=q"(x) : "q" (x)); + return 
x; +} +#else +void partab_init(); + +static inline int parityb(unsigned char x){ + extern unsigned char Partab[256]; + extern int P_init; + if(!P_init){ + partab_init(); + } + return Partab[x]; +} +#endif + + +static inline int parity(int x){ + /* Fold down to one byte */ + x ^= (x >> 16); + x ^= (x >> 8); + return parityb(x); +} + +/* Useful utilities for simulation */ +double normal_rand(double mean, double std_dev); +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip); + +extern int Bitcnt[]; + +/* Dot product functions */ +void *initdp(signed short coeffs[],int len); +void freedp(void *dp); +long dotprod(void *dp,signed short a[]); + +void *initdp_port(signed short coeffs[],int len); +void freedp_port(void *dp); +long dotprod_port(void *dp,signed short a[]); + +#ifdef __i386__ +void *initdp_mmx(signed short coeffs[],int len); +void freedp_mmx(void *dp); +long dotprod_mmx(void *dp,signed short a[]); + +void *initdp_sse(signed short coeffs[],int len); +void freedp_sse(void *dp); +long dotprod_sse(void *dp,signed short a[]); + +void *initdp_sse2(signed short coeffs[],int len); +void freedp_sse2(void *dp); +long dotprod_sse2(void *dp,signed short a[]); +#endif + +#ifdef __x86_64__ +void *initdp_sse2(signed short coeffs[],int len); +void freedp_sse2(void *dp); +long dotprod_sse2(void *dp,signed short a[]); +#endif + +#ifdef __VEC__ +void *initdp_av(signed short coeffs[],int len); +void freedp_av(void *dp); +long dotprod_av(void *dp,signed short a[]); +#endif + +/* Sum of squares - accepts signed shorts, produces unsigned long long */ +unsigned long long sumsq(signed short *in,int cnt); +unsigned long long sumsq_port(signed short *in,int cnt); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *in,int cnt); +unsigned long long sumsq_sse(signed short *in,int cnt); +unsigned long long sumsq_sse2(signed short *in,int cnt); +#endif +#ifdef __x86_64__ +unsigned long long sumsq_sse2(signed short *in,int cnt); +#endif +#ifdef __VEC__ 
+unsigned long long sumsq_av(signed short *in,int cnt); +#endif + + +/* Low-level data structures and routines */ + +int cpu_features(void); + +#endif /* _FEC_H_ */ + + + diff --git a/libfec/fixed.h b/libfec/fixed.h new file mode 100644 index 0000000..0ff27b2 --- /dev/null +++ b/libfec/fixed.h @@ -0,0 +1,33 @@ +/* Stuff specific to the CCSDS (255,223) RS codec + * (255,223) code over GF(256). Note: the conventional basis is still + * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c + * + * Copyright 2003 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned char data_t; + +static inline int mod255(int x){ + while (x >= 255) { + x -= 255; + x = (x >> 8) + (x & 255); + } + return x; +} +#define MODNN(x) mod255(x) + +extern data_t CCSDS_alpha_to[]; +extern data_t CCSDS_index_of[]; +extern data_t CCSDS_poly[]; + +#define MM 8 +#define NN 255 +#define ALPHA_TO CCSDS_alpha_to +#define INDEX_OF CCSDS_index_of +#define GENPOLY CCSDS_poly +#define NROOTS 32 +#define FCR 112 +#define PRIM 11 +#define IPRIM 116 +#define PAD pad + diff --git a/libfec/gen_ccsds.c b/libfec/gen_ccsds.c new file mode 100644 index 0000000..e1e2e26 --- /dev/null +++ b/libfec/gen_ccsds.c @@ -0,0 +1,39 @@ +/* Generate tables for CCSDS code + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include "char.h" +#include "rs-common.h" +#include "fec.h" + +int main(){ + struct rs *rs; + int i; + + rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */ + assert(rs != NULL); + printf("char CCSDS_alpha_to[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("0x%02x,",rs->alpha_to[i]); + } + printf("\n};\n\nchar CCSDS_index_of[] = {"); + for(i=0;i<256;i++){ + if((i % 16) == 0) + printf("\n"); + printf("%3d,",rs->index_of[i]); + } + printf("\n};\n\nchar CCSDS_poly[] = {"); + for(i=0;i<33;i++){ + 
if((i % 16) == 0) + printf("\n"); + + printf("%3d,",rs->genpoly[i]); + } + printf("\n};\n"); + exit(0); +} diff --git a/libfec/gen_ccsds_tal.c b/libfec/gen_ccsds_tal.c new file mode 100644 index 0000000..fc75503 --- /dev/null +++ b/libfec/gen_ccsds_tal.c @@ -0,0 +1,53 @@ +/* Conversion lookup tables from conventional alpha to Berlekamp's + * dual-basis representation. Used in the CCSDS version only. + * taltab[] -- convert conventional to dual basis + * tal1tab[] -- convert dual basis to conventional + + * Note: the actual RS encoder/decoder works with the conventional basis. + * So data is converted from dual to conventional basis before either + * encoding or decoding and then converted back. + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include + +#define DTYPE unsigned char +DTYPE Taltab[256],Tal1tab[256]; + +static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b }; + +/* Generate conversion lookup tables between conventional alpha representation + * (@**7, @**6, ...@**0) + * and Berlekamp's dual basis representation + * (l0, l1, ...l7) + */ +int main(){ + int i,j,k; + + for(i=0;i<256;i++){/* For each value of input */ + Taltab[i] = 0; + for(j=0;j<8;j++) /* for each column of matrix */ + for(k=0;k<8;k++){ /* for each row of matrix */ + if(i & (1< +#include "fec.h" + +#if !defined(NULL) +#define NULL ((void *)0) +#endif + +#include "rs-common.h" + +void free_rs(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void 
*init_rs_common(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/init_rs.h b/libfec/init_rs.h new file mode 100644 index 0000000..2b2ae98 --- /dev/null +++ b/libfec/init_rs.h @@ -0,0 +1,106 @@ +/* Common code for initializing a Reed-Solomon control block (char or int symbols) + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#undef NULL +#define NULL ((void *)0) + +{ + int i, j, sr,root,iprim; + + rs = NULL; + /* Check parameter ranges */ + if(symsize < 0 || symsize > 8*sizeof(data_t)){ + goto done; + } + + if(fcr < 0 || fcr >= (1<= (1<= (1<= ((1<mm = symsize; + rs->nn = (1<pad = pad; + + rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->alpha_to == NULL){ + free(rs); + rs = NULL; + goto done; + } + rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1)); + if(rs->index_of == NULL){ + free(rs->alpha_to); + free(rs); + rs = NULL; + goto done; + } + + /* Generate Galois field lookup tables */ + rs->index_of[0] = A0; /* log(zero) = -inf */ + rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */ + sr = 1; + for(i=0;inn;i++){ + rs->index_of[sr] = i; + rs->alpha_to[i] = sr; + sr <<= 1; + if(sr & (1<nn; + } + if(sr != 1){ + /* field generator polynomial is not primitive! 
*/ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + + /* Form RS code generator polynomial from its roots */ + rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1)); + if(rs->genpoly == NULL){ + free(rs->alpha_to); + free(rs->index_of); + free(rs); + rs = NULL; + goto done; + } + rs->fcr = fcr; + rs->prim = prim; + rs->nroots = nroots; + + /* Find prim-th root of 1, used in decoding */ + for(iprim=1;(iprim % prim) != 0;iprim += rs->nn) + ; + rs->iprim = iprim / prim; + + rs->genpoly[0] = 1; + for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) { + rs->genpoly[i+1] = 1; + + /* Multiply rs->genpoly[] by @**(root + x) */ + for (j = i; j > 0; j--){ + if (rs->genpoly[j] != 0) + rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)]; + else + rs->genpoly[j] = rs->genpoly[j-1]; + } + /* rs->genpoly[0] can never be zero */ + rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)]; + } + /* convert rs->genpoly[] to index form for quicker encoding */ + for (i = 0; i <= nroots; i++) + rs->genpoly[i] = rs->index_of[rs->genpoly[i]]; + done:; + +} diff --git a/libfec/init_rs_char.c b/libfec/init_rs_char.c new file mode 100644 index 0000000..a51099a --- /dev/null +++ b/libfec/init_rs_char.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "char.h" +#include "rs-common.h" + +void free_rs_char(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = 
padding bytes at front of shortened block + */ +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/init_rs_int.c b/libfec/init_rs_int.c new file mode 100644 index 0000000..a6036c2 --- /dev/null +++ b/libfec/init_rs_int.c @@ -0,0 +1,35 @@ +/* Initialize a RS codec + * + * Copyright 2002 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include + +#include "int.h" +#include "rs-common.h" + +void free_rs_int(void *p){ + struct rs *rs = (struct rs *)p; + + free(rs->alpha_to); + free(rs->index_of); + free(rs->genpoly); + free(rs); +} + +/* Initialize a Reed-Solomon codec + * symsize = symbol size, bits + * gfpoly = Field generator polynomial coefficients + * fcr = first root of RS code generator polynomial, index form + * prim = primitive element to generate polynomial roots + * nroots = RS code generator polynomial degree (number of roots) + * pad = padding bytes at front of shortened block + */ +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad){ + struct rs *rs; + +#include "init_rs.h" + + return rs; +} diff --git a/libfec/install-sh b/libfec/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/libfec/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. 
not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo 
"install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. 
+ + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. 
+ + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/libfec/int.h b/libfec/int.h new file mode 100644 index 0000000..46e865d --- /dev/null +++ b/libfec/int.h @@ -0,0 +1,22 @@ +/* Stuff specific to the general (integer) version of the Reed-Solomon codecs + * + * Copyright 2003, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +typedef unsigned int data_t; + +#define MODNN(x) modnn(rs,x) + +#define MM (rs->mm) +#define NN (rs->nn) +#define ALPHA_TO (rs->alpha_to) +#define INDEX_OF (rs->index_of) +#define GENPOLY (rs->genpoly) +#define NROOTS (rs->nroots) +#define FCR (rs->fcr) +#define PRIM (rs->prim) +#define IPRIM (rs->iprim) +#define PAD (rs->pad) +#define A0 (NN) + + diff --git a/libfec/lesser.txt b/libfec/lesser.txt new file mode 100644 index 0000000..b1e3f5a --- /dev/null +++ b/libfec/lesser.txt @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. 
You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. 
+ + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+
+
diff --git a/libfec/libfec.pc.in b/libfec/libfec.pc.in
new file mode 100644
index 0000000..c569da9
--- /dev/null
+++ b/libfec/libfec.pc.in
@@ -0,0 +1,13 @@
+prefix=@LIBFEC_PC_PREFIX@
+exec_prefix=@LIBFEC_PC_EXEC_PREFIX@
+libdir=@LIBFEC_PC_LIBDIR@
+includedir=@LIBFEC_PC_INCLUDEDIR@
+
+Name: FEC library
+Description: A fork of KA9Q's FEC library
+Version: @LIBFEC_PC_VERSION@
+URL: http://opendigitalradio.org
+Cflags: -I${includedir}/ @LIBFEC_PC_CFLAGS@
+Libs: -L${libdir}/ @LIBFEC_PC_LIBS@
+Libs.private: @LIBFEC_PC_PRIV_LIBS@
+
diff --git a/libfec/makefile.in b/libfec/makefile.in
new file mode 100644
index 0000000..cc116ab
--- /dev/null
+++ b/libfec/makefile.in
@@ -0,0 +1,249 @@
+# Makefile prototype for configure
+# Copyright 2004 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+# @configure_input@
+srcdir = @srcdir@
+prefix = @prefix@
+exec_prefix=@exec_prefix@
+VPATH = @srcdir@
+CC=@CC@
+LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \
+	viterbi39.o viterbi39_port.o \
+	viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \
+	decode_rs_char.o decode_rs_int.o decode_rs_8.o \
+	init_rs_char.o init_rs_int.o ccsds_tab.o \
+	encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \
+	dotprod.o dotprod_port.o \
+	peakval.o peakval_port.o \
+	sumsq.o sumsq_port.o
+
+CFLAGS=@CFLAGS@ -I. -fPIC -Wall @ARCH_OPTION@
+
+SHARED_LIB=@SH_LIB@
+
+all: libfec.a $(SHARED_LIB)
+
+
+test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest
+	@echo "Correctness tests:"
+	./vtest27 -e 3.0 -n 1000 -v
+	./vtest29 -e 2.5 -n 1000 -v
+	./vtest39 -e 2.5 -n 1000 -v
+	./vtest615 -e 1.0 -n 100 -v
+	./rstest
+	./dtest
+	./sumsq_test
+	./peaktest
+	@echo "Speed tests:"
+	./vtest27
+	./vtest29
+	./vtest39
+	./vtest615
+
+install: all
+	mkdir -p $(DESTDIR)@libdir@
+	install -m 644 -p $(SHARED_LIB) libfec.a $(DESTDIR)@libdir@
+#	(cd $(DESTDIR)@libdir@;ln -f -s $(SHARED_LIB) libfec.so)
+	@REBIND@
+	mkdir -p $(DESTDIR)@includedir@
+	install -m 644 -p fec.h $(DESTDIR)@includedir@
+	mkdir -m 0755 -p $(DESTDIR)@mandir@/man3
+	install -m 644 -p simd-viterbi.3 rs.3 dsp.3 $(DESTDIR)@mandir@/man3
+
+peaktest: peaktest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+sumsq_test: sumsq_test.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+dtest: dtest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest27: vtest27.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest29: vtest29.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest39: vtest39.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest615: vtest615.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+rstest: rstest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+rs_speedtest: rs_speedtest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug?
+vtest27.o: vtest27.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest29.o: vtest29.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest39.o: vtest39.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest615.o: vtest615.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+libfec.a: $(LIBS)
+	ar rv $@ $^
+	ranlib libfec.a
+
+# for Darwin
+libfec.dylib: $(LIBS)
+	$(CC) -dynamiclib -install_name $@ -o $@ $^
+
+# for Linux et al
+libfec.so: $(LIBS)
+	$(CC) -fPIC -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc -lm
+
+dotprod.o: dotprod.c fec.h
+
+dotprod_port.o: dotprod_port.c fec.h
+
+viterbi27.o: viterbi27.c fec.h
+
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+
+encode_rs_char.o: encode_rs_char.c char.h rs-common.h
+
+encode_rs_int.o: encode_rs_int.c int.h rs-common.h
+
+encode_rs_8.o: encode_rs_8.c fixed.h
+
+encode_rs_av.o: encode_rs_av.c fixed.h
+
+decode_rs_char.o: decode_rs_char.c char.h rs-common.h
+
+decode_rs_int.o: decode_rs_int.c int.h rs-common.h
+
+decode_rs_8.o: decode_rs_8.c fixed.h
+
+init_rs_char.o: init_rs_char.c char.h rs-common.h
+
+init_rs_int.o: init_rs_int.c int.h rs-common.h
+
+ccsds_tab.o: ccsds_tab.c
+
+ccsds_tab.c: gen_ccsds
+	./gen_ccsds > ccsds_tab.c
+
+gen_ccsds: gen_ccsds.o init_rs_char.o
+	$(CC) $(CFLAGS) -o $@ $^
+
+gen_ccsds.o: gen_ccsds.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+ccsds_tal.o: ccsds_tal.c
+
+ccsds_tal.c: gen_ccsds_tal
+	./gen_ccsds_tal > ccsds_tal.c
+
+exercise_char.o: exercise.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+exercise_int.o: exercise.c
+	$(CC) -DBIGSYM=1 $(CFLAGS) -c -o $@ $<
+
+exercise_8.o: exercise.c
+	$(CC) -DFIXED=1 $(CFLAGS) -c -o $@ $<
+
+exercise_ccsds.o: exercise.c
+	$(CC) -DCCSDS=1 $(CFLAGS) -c -o $@ $<
+
+viterbi27.o: viterbi27.c fec.h
+
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi27_av.o: viterbi27_av.c fec.h
+
+viterbi27_mmx.o: viterbi27_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi27_sse.o: viterbi27_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi27_sse2.o: viterbi27_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi29_port.o: viterbi29_port.c fec.h
+
+viterbi29_av.o: viterbi29_av.c fec.h
+
+viterbi29_mmx.o: viterbi29_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi29_sse.o: viterbi29_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi29_sse2.o: viterbi29_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_av.o: viterbi39_av.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi615.o: viterbi615.c fec.h
+
+viterbi615_port.o: viterbi615_port.c fec.h
+
+viterbi615_av.o: viterbi615_av.c fec.h
+
+viterbi615_mmx.o: viterbi615_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi615_sse.o: viterbi615_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi615_sse2.o: viterbi615_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+cpu_mode_x86.o: cpu_mode_x86.c fec.h
+
+cpu_mode_x86_64.o: cpu_mode_x86_64.c fec.h
+
+cpu_mode_ppc.o: cpu_mode_ppc.c fec.h
+
+#%.o:	%.s
+#	$(AS) $< -o $@
+
+
+
+clean:
+	rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core
+	rm -rf autom4te.cache
+
+distclean: clean
+	rm -f config.log config.cache config.status config.h makefile
+
diff --git a/libfec/mmxbfly27.s b/libfec/mmxbfly27.s
new file mode 100644
index 0000000..4abbf48
--- /dev/null
+++ b/libfec/mmxbfly27.s
@@ -0,0 +1,148 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 64-state (k=7) 
convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ;   returns 0, or -1 if vp is NULL
+*/
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+	# These are offsets into struct v27, defined in viterbi27_mmx.c
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text
+	.global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
+	.type update_viterbi27_blk_mmx,@function
+	.align 16
+
+update_viterbi27_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# vp == NULL: return -1 ('$' makes this an immediate, not a load)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+
+	movb %ah,%bl
+	andl $255,%eax
+	andl $255,%ebx
+
+	# shift into first array index dimension slot
+	shll $5,%eax
+	shll $5,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0
+
+	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3		# emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+32)(%esi),%mm2	# Incoming path metric, high bit = 1
+	movq %mm6,%mm1
+	movq %mm2,%mm7
+
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor %mm0,%mm3		# invert branch metric
+	paddb %mm3,%mm7		# path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7; mm1 and mm2
+	pxor %mm3,%mm3
+	movq %mm6,%mm4
+	movq %mm1,%mm5
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7
+	movq %mm5,%mm3
+	pand %mm5,%mm2
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new branch metrics in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)
+
+	.endm
+
+# invoke macro 4 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+
+	addl $64,%edx		# bump decision pointer
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# return 0 on success
+err:	popl %ebx		# common exit: eax already holds the return value
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:
+	.byte 15,15,15,15,15,15,15,15
+
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/libfec/mmxbfly29.s b/libfec/mmxbfly29.s
new file mode 100644
index 0000000..e37cab8
--- /dev/null
+++ b/libfec/mmxbfly29.s
@@ -0,0 +1,161 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits);
+*/
+
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+	.text
+	.global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2
+	.type update_viterbi29_blk_mmx,@function
+	.align 16
+
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+update_viterbi29_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# vp == NULL: return -1 ('$' makes this an immediate, not a load)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+
+	movb %ah,%bl
+	andl $255,%eax
+	andl $255,%ebx
+
+	# shift into first array index dimension slot
+	shll $7,%eax
+	shll $7,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab29_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0
+	paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3		# emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+128)(%esi),%mm2	# Incoming path metric, high bit = 1
+	movq %mm6,%mm1
+	movq %mm2,%mm7
+
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor %mm0,%mm3		# invert branch metric
+	paddb %mm3,%mm7		# path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7; mm1 and mm2
+	pxor %mm3,%mm3
+	movq %mm6,%mm4
+	movq %mm1,%mm5
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7
+	movq %mm5,%mm3
+	pand %mm5,%mm2
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new branch metrics in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)
+
+	.endm
+
+# invoke macro 16 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+	butterfly GROUP=8
+	butterfly GROUP=9
+	butterfly GROUP=10
+	butterfly GROUP=11
+	butterfly GROUP=12
+	butterfly GROUP=13
+	butterfly GROUP=14
+	butterfly GROUP=15
+
+	addl $256,%edx		# bump decision pointer
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# return 0 on success
+err:	popl %ebx		# common exit: eax already holds the return value
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:
+	.byte 15,15,15,15,15,15,15,15
+
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/libfec/peak_mmx_assist.s b/libfec/peak_mmx_assist.s
new file mode 100644
index 0000000..dae831f
--- /dev/null
+++ b/libfec/peak_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+# int 
peakval_mmx(signed short *in,int cnt); + .global peakval_mmx + .type peakval_mmx,@function + .align 16 +peakval_mmx: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/libfec/peak_sse2_assist.s b/libfec/peak_sse2_assist.s new file mode 100644 index 0000000..1dee3a8 --- /dev/null +++ b/libfec/peak_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse2(signed short *in,int cnt); + .global peakval_sse2 + .type peakval_sse2,@function + .align 16 +peakval_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl 
$16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peak_sse_assist.s b/libfec/peak_sse_assist.s new file mode 100644 index 0000000..ea6fce8 --- /dev/null +++ b/libfec/peak_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse(signed short *in,int cnt); + .global peakval_sse + .type peakval_sse,@function + .align 16 +peakval_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # max value in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peaktest.c b/libfec/peaktest.c new file mode 100644 index 0000000..fa4b280 --- /dev/null +++ b/libfec/peaktest.c @@ -0,0 +1,38 @@ +/* Verify correctness of the peak routine + * Copyright 2004 Phil Karn, KA9Q + */ +#include +#include +#include + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +int peakval(signed short *,int); +int peakval_port(signed short *,int); + +int main(){ + int i,s; + int result,rresult; + signed short samples[NSAMP]; + +
srandom(time(NULL)); + + for(i=0;i +#include "fec.h" + +int peakval_port(signed short *b,int cnt); +#ifdef __i386__ +int peakval_mmx(signed short *b,int cnt); +int peakval_sse(signed short *b,int cnt); +int peakval_sse2(signed short *b,int cnt); +#endif + +#ifdef __x86_64__ +int peakval_sse2(signed short *b,int cnt); +#endif + +#ifdef __VEC__ +int peakval_av(signed short *b,int cnt); +#endif + +int peakval(signed short *b,int cnt){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return peakval_port(b,cnt); +#ifdef __i386__ + case MMX: + return peakval_mmx(b,cnt); + case SSE: + return peakval_sse(b,cnt); + case SSE2: + return peakval_sse2(b,cnt); +#endif + +#ifdef __x86_64__ + case SSE2: + return peakval_port(b,cnt); + //return peakval_sse2(b,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return peakval_av(b,cnt); +#endif + } +} diff --git a/libfec/peakval_av.c b/libfec/peakval_av.c new file mode 100644 index 0000000..ae54c10 --- /dev/null +++ b/libfec/peakval_av.c @@ -0,0 +1,61 @@ +/* Return the largest absolute value of a vector of signed shorts + + * This is the Altivec SIMD version. 
+ + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +signed short peakval_av(signed short *in,int cnt){ + vector signed short x; + int pad; + union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s; + vector signed short smallest,largest; + + smallest = (vector signed short)(0); + largest = (vector signed short)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + smallest = vec_min(smallest,x); + largest = vec_max(largest,x); + } + /* Combine and extract result */ + largest = vec_max(largest,vec_abs(smallest)); + + s.c[15] = 64; /* Shift right four 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 32; /* Shift right two 16-bit words */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.c[15] = 16; /* Shift right one 16-bit word */ + largest = vec_max(largest,vec_sro(largest,s.cv)); + + s.hv = largest; + return s.s[7]; +} diff --git a/libfec/peakval_mmx.c b/libfec/peakval_mmx.c new file mode 100644 index 0000000..436fe88 --- /dev/null +++ b/libfec/peakval_mmx.c @@ -0,0 +1,34 @@ +/* Wrapper for the MMX version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include + +int peakval_mmx_assist(signed short *,int); + +int peakval_mmx(signed short *b,int cnt){ + int 
peak = 0; + int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_mmx_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_mmx_assist.s b/libfec/peakval_mmx_assist.s new file mode 100644 index 0000000..553cb79 --- /dev/null +++ b/libfec/peakval_mmx_assist.s @@ -0,0 +1,70 @@ +# MMX assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak value in signed 16-bit input samples +# int peakval_mmx_assist(signed short *in,int cnt); + .global peakval_mmx_assist + .type peakval_mmx_assist,@function + .align 16 +peakval_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + movq %mm7,%mm6 # copy previous peak + pcmpgtw %mm0,%mm6 # ff == old peak greater + pand %mm6,%mm7 # select old peaks that are greater + pandn %mm0,%mm6 # select new values that are greater + por %mm6,%mm7 + + addl $8,%esi + jmp 1b + +2: movd %mm7,%eax + psrlq $16,%mm7 + andl $0xffff,%eax + + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 3f + movl %edx,%eax +3: + movd %mm7,%edx + psrlq $16,%mm7 + andl $0xffff,%edx + cmpl %edx,%eax + jnl 4f + movl %edx,%eax +4: + movd %mm7,%edx + andl $0xffff,%edx + cmpl %edx,%eax + jnl 5f + movl %edx,%eax +5: + emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + diff --git a/libfec/peakval_port.c b/libfec/peakval_port.c new file mode 100644 index 0000000..07ab316 --- /dev/null +++ b/libfec/peakval_port.c @@ -0,0 +1,16 
@@ +/* Portable C version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdlib.h> +#include "fec.h" +int peakval_port(signed short *b,int len){ /* largest |b[i]| over len samples; 0 when len <= 0 */ + int peak = 0; + int a,i; + + for(i=0;i<len;i++){ + a = abs(b[i]); + if(a > peak) + peak = a; + } + return peak; +} diff --git a/libfec/peakval_sse.c b/libfec/peakval_sse.c new file mode 100644 index 0000000..9868b7f --- /dev/null +++ b/libfec/peakval_sse.c @@ -0,0 +1,35 @@ +/* IA-32 SSE version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ + +#include <stdlib.h> +#include "fec.h" + +int peakval_sse_assist(signed short *,int); + +int peakval_sse(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 7) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~3; + cnt &= 3; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_sse2.c b/libfec/peakval_sse2.c new file mode 100644 index 0000000..79d9059 --- /dev/null +++ b/libfec/peakval_sse2.c @@ -0,0 +1,34 @@ +/* IA-32 SSE2 version of peakval + * Copyright 2004 Phil Karn, KA9Q + */ +#include <stdlib.h> +#include "fec.h" + +int peakval_sse2_assist(signed short *,int); + +int peakval_sse2(signed short *b,int cnt){ + int peak = 0; + int a; + + while(((int)b & 15) != 0 && cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + a = peakval_sse2_assist(b,cnt); + if(a > peak) + peak = a; + b += cnt & ~7; + cnt &= 7; + + while(cnt != 0){ + a = abs(*b); + if(a > peak) + peak = a; + b++; + cnt--; + } + return peak; +} diff --git a/libfec/peakval_sse2_assist.s b/libfec/peakval_sse2_assist.s new file mode 100644 index 0000000..c7a58e7 --- /dev/null +++ b/libfec/peakval_sse2_assist.s @@ -0,0 +1,51 @@ +# SSE2 assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int
peakval_sse2_assist(signed short *in,int cnt); + .global peakval_sse2_assist + .type peakval_sse2_assist,@function + .align 16 +peakval_sse2_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %xmm7,%xmm7 # clear peak + +1: subl $8,%ecx + jl 2f + movaps (%esi),%xmm0 + movaps %xmm0,%xmm1 + psraw $15,%xmm1 # xmm1 = 1's if negative, 0's if positive + pxor %xmm1,%xmm0 # complement negatives + psubw %xmm1,%xmm0 # add 1 to negatives + pmaxsw %xmm0,%xmm7 # store peak + + addl $16,%esi + jmp 1b + +2: movaps %xmm7,%xmm0 + psrldq $8,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $32,%xmm0 + pmaxsw %xmm0,%xmm7 + movaps %xmm7,%xmm0 + psrlq $16,%xmm0 + pmaxsw %xmm0,%xmm7 # max value in low word of %xmm7 + + movd %xmm7,%eax + andl $0xffff,%eax + + popl %ecx + popl %esi + popl %ebp + ret diff --git a/libfec/peakval_sse_assist.s b/libfec/peakval_sse_assist.s new file mode 100644 index 0000000..827c800 --- /dev/null +++ b/libfec/peakval_sse_assist.s @@ -0,0 +1,49 @@ +# SSE assist routines for peakval +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Lesser General Public License (LGPL) + + .text + +# Find peak absolute value in signed 16-bit input samples +# int peakval_sse_assist(signed short *in,int cnt); + .global peakval_sse_assist + .type peakval_sse_assist,@function + .align 16 +peakval_sse_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + + pxor %mm7,%mm7 # clear peak + +1: subl $4,%ecx + jl 2f + movq (%esi),%mm0 + movq %mm0,%mm1 + psraw $15,%mm1 # mm1 = 1's if negative, 0's if positive + pxor %mm1,%mm0 # complement negatives + psubw %mm1,%mm0 # add 1 to negatives + pmaxsw %mm0,%mm7 # store peak + + addl $8,%esi + jmp 1b + +2: movq %mm7,%mm0 + psrlq $32,%mm0 + pmaxsw %mm0,%mm7 + movq %mm7,%mm0 + psrlq $16,%mm0 + pmaxsw %mm0,%mm7 # max value in low word of %mm7 + + movd %mm7,%eax + andl $0xffff,%eax + + emms + popl %ecx
+ popl %esi + popl %ebp + ret diff --git a/libfec/rs-common.h b/libfec/rs-common.h new file mode 100644 index 0000000..e64eb39 --- /dev/null +++ b/libfec/rs-common.h @@ -0,0 +1,26 @@ +/* Stuff common to all the general-purpose Reed-Solomon codecs + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +/* Reed-Solomon codec control block */ +struct rs { + int mm; /* Bits per symbol */ + int nn; /* Symbols per block (= (1<= rs->nn) { + x -= rs->nn; + x = (x >> rs->mm) + (x & rs->nn); + } + return x; +} diff --git a/libfec/rs.3 b/libfec/rs.3 new file mode 100644 index 0000000..5d71503 --- /dev/null +++ b/libfec/rs.3 @@ -0,0 +1,198 @@ +.TH REED-SOLOMON 3 +.SH NAME +init_rs_int, encode_rs_int, decode_rs_int, free_rs_int, +init_rs_char, encode_rs_char, decode_rs_char, free_rs_char, +encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds +\- Reed-Solomon encoding/decoding +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" + +void *init_rs_int(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_int(void *rs,int *data,int *parity); + +int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras); + +void free_rs_int(void *rs); + + +void *init_rs_char(int symsize,int gfpoly,int fcr,int prim, + int nroots,int pad); + +void encode_rs_char(void *rs,unsigned char *data, + unsigned char *parity); + +int decode_rs_char(void *rs,unsigned char *data,int *eras_pos, + int no_eras); + +void free_rs_char(void *rs); + + +void encode_rs_8(unsigned char *data,unsigned char *parity, + int pad); + +int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras, + int pad); + + +void encode_rs_ccsds(unsigned char *data,unsigned char *parity, + int pad); + +int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras, + int pad); + +unsigned char Taltab[256]; +unsigned char Tal1tab[256]; + +.fi + +.SH DESCRIPTION +These functions implement Reed-Solomon error control encoding and +decoding. 
For optimal performance in a variety of applications, three +sets of functions are supplied. To access these functions, add "-lfec" +to your linker command line. + +The functions with names ending in \fB_int\fR handle data in integer arrays, +permitting arbitrarily large codewords limited only by machine +resources. + +The functions with names ending in \fB_char\fR take unsigned char arrays and can +handle codes with symbols of 8 bits or less (i.e., with codewords of +255 symbols or less). + +\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific +(255,223) code with 8-bit symbols specified by the CCSDS: +a field generator of 1 + X + X^2 + X^7 + X^8 and a code +generator with first consecutive root = 112 and a primitive element of +11. These functions use the conventional +polynomial form, \fInot\fR the dual-basis specified in +the CCSDS standard, to represent symbols. This code may be +shortened by giving a non-zero \fBpad\fR value to produce a +(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the +specified number of zeroes at the front of the full codeword. + +For full CCSDS compatibility, \fBencode_rs_ccsds\fR and +\fBdecode_rs_ccsds\fR are provided. These functions use two lookup +tables, \fBTaltab\fR to convert from conventional to dual-basis, and +\fBTal1tab\fR to perform the inverse mapping from dual-basis to +conventional form, before and after calls to \fBencode_rs_8\fR +and \fBdecode_rs_8\fR. + +The \fB_8\fR and \fB_ccsds\fR functions do not require initialization. + +To use the general purpose RS encoder or decoder (i.e., +the \fB_char\fR or \fB_int\fR versions), the user must first +call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. The +arguments are as follows: + +\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR +or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a +huge code would exhaust memory limits on a 32-bit machine). 
The resulting +Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols, +each containing \fBsymsize\fR bits. The codeword may be shortened with the +\fBpad\fR parameter described below. + +\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients, +with the 0th coefficient in the low order bit. The polynomial +\fImust\fR be primitive; if not, the call will fail and NULL will be +returned. + +\fBfcr\fR gives, in index form, the first consecutive root of the +Reed Solomon code generator polynomial. + +\fBprim\fR gives, in index form, the primitive element in the Galois field +used to generate the Reed Solomon code generator polynomial. + +\fBnroots\fR gives the number of roots in the Reed Solomon code +generator polynomial. This equals the number of parity symbols +per code block. + +\fBpad\fR gives the number of leading symbols in the codeword +that are implicitly padded to zero in a shortened code block. + +The resulting Reed-Solomon code has parameters (N,K), where +N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR. + +The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept +the pointer returned by \fBinit_rs_char\fR or +\fBinit_rs_int\fR, respectively, to +encode a block of data using the specified code. +The input data array is expected to +contain K symbols (of \fBsymsize\fR bits each, right justified +in each char or int) and \fBnroots\fR parity symbols will be placed +into the \fBparity\fR array, right justified. + +The \fBdecode_\fR functions correct +the errors in a Reed-Solomon codeword of N symbols up to the capability of the code. +An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR +array to assist the decoder; this parameter may be NULL if no erasures +are given. The number of erased symbols must be given in the \fBno_eras\fR +parameter. + +To maximize performance, the encode and decode functions perform no +"sanity checking" of their inputs. 
Decoder failure may result if +\fBeras_pos\fR contains duplicate entries, and both encoder and +decoder will fail if an input symbol exceeds its allowable range. +(Symbol range overflow cannot occur with the \fB_8\fR or +\fB_ccsds\fR functions, +or with the \fB_char\fR functions when 8-bit symbols are specified.) + +The decoder corrects the symbols "in place", returning the number +of symbols in error. If the codeword is uncorrectable, -1 is returned +and the data block is unchanged. If \fBeras_pos\fR is non-null, it is +used to return a list of corrected symbol positions, in no particular +order. This means that the +array passed through this parameter \fImust\fR have at least \fBnroots\fR +elements to prevent a possible buffer overflow. + +The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal +space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions, +respectively. + +The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have +corresponding \fBinit\fR and \fBfree\fR, nor do they take the +\fBrs\fR argument accepted by the other functions as their parameters +are statically compiled. These functions implement a code +equivalent to calling + +\fBinit_rs_char\fR(8,0x187,112,11,32,pad); + +and using the resulting pointer with \fBencode_rs_char\fR and +\fBdecode_rs_char\fR. + +.SH RETURN VALUES +\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal +control structure that must be passed to the corresponding encode, decode +and free functions. These functions return NULL on error. + +The \fBdecode_\fR functions return a count of corrected +symbols, or -1 if the block was uncorrectable. + +.SH AUTHOR +Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert +Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy +(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar +Welz (dwelz@web.de). + +.SH COPYRIGHT +Copyright 2004, Phil Karn, KA9Q.
May be used under the terms of the +GNU Lesser General Public License (LGPL). + +.SH SEE ALSO +CCSDS 101.0-B-6: Telemetry Channel Coding. +http://www.ccsds.org/documents/101x0b6.pdf + +.SH NOTE +CCSDS chose the "dual basis" symbol representation because it +simplified the implementation of a Reed-Solomon encoder in dedicated +hardware. However, this approach holds no advantages for a software +implementation on a general purpose computer, so use of the dual basis +is recommended only if compatibility with the CCSDS standard is needed, +e.g., to decode data from an existing spacecraft using the CCSDS +standard. If you just want a fast (255,223) RS codec without needing +to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR +and \fBdecode_rs_8\fR. + diff --git a/libfec/rs_speedtest.c b/libfec/rs_speedtest.c new file mode 100644 index 0000000..225f160 --- /dev/null +++ b/libfec/rs_speedtest.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include +#include "fec.h" + +int main(){ + unsigned char block[255]; + int i; + void *rs; + struct rusage start,finish; + double extime; + int trials = 10000; + + for(i=0;i<223;i++) + block[i] = 0x01; + + rs = init_rs_char(8,0x187,112,11,32,0); + encode_rs_char(rs,block,&block[223]); + + getrusage(RUSAGE_SELF,&start); + for(i=0;i +#include +#include +#include +#include "fec.h" + + +struct etab { + int symsize; + int genpoly; + int fcs; + int prim; + int nroots; + int ntrials; +} Tab[] = { + {2, 0x7, 1, 1, 1, 10 }, + {3, 0xb, 1, 1, 2, 10 }, + {4, 0x13, 1, 1, 4, 10 }, + {5, 0x25, 1, 1, 6, 10 }, + {6, 0x43, 1, 1, 8, 10 }, + {7, 0x89, 1, 1, 10, 10 }, + {8, 0x11d, 1, 1, 32, 10 }, + {8, 0x187, 112,11, 32, 10 }, /* Duplicates CCSDS codec */ + {9, 0x211, 1, 1, 32, 10 }, + {10,0x409, 1, 1, 32, 10 }, + {11,0x805, 1, 1, 32, 10 }, + {12,0x1053, 1, 1, 32, 5 }, + {13,0x201b, 1, 1, 32, 2 }, + {14,0x4443, 1, 1, 32, 1 }, + {15,0x8003, 1, 1, 32, 1 }, + {16,0x1100b, 1, 1, 32, 1 }, + {0, 0, 0, 0, 0}, +}; + +int 
exercise_char(struct etab *e); +int exercise_int(struct etab *e); +int exercise_8(void); + +int main(){ + int i; + + srandom(time(NULL)); + + printf("Testing fixed CCSDS encoder...\n"); + exercise_8(); + for(i=0;Tab[i].symsize != 0;i++){ + int nn,kk; + + nn = (1<symsize) - 1; + unsigned char block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + if(e->symsize > 8) + return -1; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_char failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;isymsize) - 1; + int block[nn],tblock[nn]; + int errlocs[nn],derrlocs[nn]; + int i; + int errors; + int derrors,kk; + int errval,errloc; + int erasures; + int decoder_errors = 0; + void *rs; + + /* Compute code parameters */ + kk = nn - e->nroots; + + rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0); + if(rs == NULL){ + printf("init_rs_int failed!\n"); + return -1; + } + /* Test up to the error correction capacity of the code */ + for(errors=0;errors <= e->nroots/2;errors++){ + + /* Load block with random data and encode */ + for(i=0;i +#include +#include "fec.h" + +#define MAX_RANDOM 0x7fffffff + +/* Generate gaussian random double with specified mean and std_dev */ +double normal_rand(double mean, double std_dev) +{ + double fac,rsq,v1,v2; + static double gset; + static int iset; + + if(iset){ + /* Already got one */ + iset = 0; + return mean + std_dev*gset; + } + /* Generate two evenly distributed numbers between -1 and +1 + * that are inside the unit circle + */ + do { + v1 = 2.0 * (double)random() / MAX_RANDOM - 1; + v2 = 2.0 * (double)random() / MAX_RANDOM - 1; + rsq = v1*v1 + v2*v2; + } 
while(rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0*log(rsq)/rsq); + gset = v1*fac; + iset++; + return mean + std_dev*v2*fac; +} + +unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){ + int sample; + + sample = offset + gain*normal_rand(sym?amp:-amp,1.0); + /* Clip to 8-bit offset range */ + if(sample < 0) + sample = 0; + else if(sample > clip) + sample = clip; + return sample; +} diff --git a/libfec/simd-viterbi.3 b/libfec/simd-viterbi.3 new file mode 100644 index 0000000..4c67593 --- /dev/null +++ b/libfec/simd-viterbi.3 @@ -0,0 +1,247 @@ +.TH SIMD-VITERBI 3 +.SH NAME +create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk, +chainback_viterbi27, delete_viterbi27, +create_viterbi29, set_viterbi29_polynomial, init_viterbi29, update_viterbi29_blk, +chainback_viterbi29, delete_viterbi29, +create_viterbi39, set_viterbi39_polynomial, init_viterbi39, update_viterbi39_blk, +chainback_viterbi39, delete_viterbi39, +create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk, +chainback_viterbi615, delete_viterbi615 \- IA32 SIMD-assisted Viterbi decoders +.SH SYNOPSIS +.nf +.ft B +#include "fec.h" +void *create_viterbi27(int blocklen); +void set_viterbi27_polynomial(int polys[2]); +int init_viterbi27(void *vp,int starting_state); +int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi27(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi29(int blocklen); +void set_viterbi29_polynomial(int polys[2]); +int init_viterbi29(void *vp,int starting_state); +int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi29(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi39(int blocklen); +void set_viterbi39_polynomial(int polys[3]); +int
init_viterbi39(void *vp,int starting_state); +int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi39(void *vp); +.fi +.sp +.nf +.ft B +void *create_viterbi615(int blocklen); +void set_viterbi615_polynomial(int polys[6]); +int init_viterbi615(void *vp,int starting_state); +int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits); +int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate); +void delete_viterbi615(void *vp); +.fi +.SH DESCRIPTION +These functions implement high performance Viterbi decoders for four +convolutional codes: a rate 1/2 constraint length 7 (k=7) code +("viterbi27"), a rate 1/2 k=9 code ("viterbi29"), +a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615"). +The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve +decoding speed. + +On the IA32 there are three different SIMD instruction sets. The first +and most common is MMX, introduced on later Intel Pentiums and then on +the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe, +etc). SSE was introduced on the Pentium III and later implemented in +the AMD Athlon 4 (AMD calls it "3D Now! Professional"). Most +recently, SSE2 was introduced in the Intel Pentium 4, and has been +adopted by more recent AMD CPUs. The presence of SSE2 implies the +existence of SSE, which in turn implies MMX. + +Altivec is the PowerPC SIMD instruction set. It is roughly comparable +to SSE2. Altivec was introduced to the general public in the Apple +Macintosh G4; it is also present in the G5. Altivec is actually a +Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it +"VMX". All refer to the same thing. + +When built for the IA32 or PPC architectures, the functions +automatically use the most powerful SIMD instruction set available. 
If +no SIMD instructions are available, or if the library is built for a +non-IA32, non-PPC machine, a portable C version is executed +instead. + +.SH USAGE +Four versions of each function are provided, one for each code. +In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39" +or "viterbi615" as desired. + +Before Viterbi decoding can begin, an instance must first be created with +\fBcreate_viterbi()\fR. This function creates and returns a pointer to +an internal control structure +containing the path metrics and the branch +decisions. \fBcreate_viterbi()\fR takes one argument that gives the +length of the data block in bits. You \fImust not\fR attempt to +decode a block longer than the length given to \fBcreate_viterbi()\fR. + +Before decoding a new frame, +\fBinit_viterbi()\fR must be called to reset the decoder state. +It accepts the instance pointer returned by +\fBcreate_viterbi()\fR and the initial starting state of the +convolutional encoder (usually 0). If the initial starting state is unknown or +incorrect, the decoder will still function but the decoded data may be +incorrect at the start of the block. + +Blocks of received symbols are processed with calls to +\fBupdate_viterbi_blk()\fR. The \fBnbits\fR parameter specifies the +number of \fIdata bits\fR (not channel symbols) represented by the +\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in +\fBsyms\fR is twice \fInbits\fR, and so on.) +Each symbol is expected to range +from 0 through 255, with 0 corresponding to a "strong 0" and 255 +corresponding to a "strong 1". The caller is responsible for +determining the proper pairing of input symbols (commonly known as +decoder symbol phasing). + +At the end of the block, the data is recovered with a call to +\fBchainback_viterbi()\fR. 
The arguments are the pointer to the +decoder instance, a pointer to a user-supplied buffer into which the +decoded data is to be written, the number of data bits (not bytes) +that are to be decoded, and the terminal state of the convolutional +encoder at the end of the frame (usually 0). If the terminal state is +incorrect or unknown, the decoded data bits at the end of the frame +may be unreliable. The decoded data is written in big-endian order, +i.e., the first bit in the frame is written into the high order bit of +the first byte in the buffer. If the frame is not an integral number +of bytes long, the low order bits of the last byte in the frame will +be unused. + +Note that the decoders assume the use of a tail, i.e., the encoding +and transmission of a sufficient number of padding bits beyond the end +of the user data to force the convolutional encoder into the known +terminal state given to \fBchainback_viterbi()\fR. The tail is +always one bit less than the constraint length of the code, so the k=7 +code uses 6 tail bits (12 tail symbols), the k=9 codes use 8 tail bits +(16 tail symbols at rate 1/2, 24 at rate 1/3) and the k=15 code uses 14 tail bits (84 tail +symbols). + +The tail bits are not included in the length arguments to +\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if +the block contains 1000 user bits, then this would be the length +parameter given to \fBcreate_viterbi27()\fR and +\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called +with a total of 2012 symbols - the last 12 encoded symbols +representing the tail bits. + +After the call to \fBchainback_viterbi()\fR, the decoder may be reset +with a call to \fBinit_viterbi()\fR and another block can be decoded. +Alternatively, \fBdelete_viterbi()\fR can be called to free all resources +used by the Viterbi decoder. + +The \fBset_viterbi_polynomial()\fR function allows use of other than the default +code generator polynomials. 
Although only one set of polynomials is generally +used with each code, there are different conventions as to their order and +symbol polarity, and these functions simplify their use. + +The default polynomials for the viterbi27 routines +are those of the NASA-JPL convention \fIwithout\fR symbol inversion. +The NASA-JPL convention normally inverts the first symbol. +The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second. +.sp +To set the NASA-JPL convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { -V27POLYA,V27POLYB }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +and to set the CCSDS convention with symbol inversion: +.sp +.nf +.ft B +int polys[2] = { V27POLYB,-V27POLYA }; +set_viterbi27_polynomial(polys); +.ft R +.fi +.sp +The default polynomials for the viterbi615 routines +are those used by the Cassini spacecraft \fIwithout\fR +symbol inversion. Mars Pathfinder (MPF) and STEREO +swap the third and fourth polynomials. +Both conventions invert the +first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions. +.sp +To set the Cassini convention with symbol inversion, do the following: + +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi +.sp +and to set the MPF/STEREO convention with symbol inversion: +.sp +.nf +.ft B +int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF }; +set_viterbi615_polynomial(polys); +.ft R +.fi + +For performance reasons, calling this function changes the code +generator polynomials for \fIall\fR instances of the corresponding Viterbi decoder, +including those already created. + +.SH ERROR PERFORMANCE +These decoders have all been extensively tested and found to provide +performance consistent with that expected for soft-decision Viterbi +decoding with 8-bit symbols. 
+ +Due to internal differences, the implementations +vary slightly in error performance. In +general, the portable C versions exhibit the best error performance +because they use full-sized branch metrics, and the MMX versions +exhibit the worst because they use 8-bit branch metrics with modulo +comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and +r=1/2 k=9 codes use unsigned +8-bit branch metrics, and are almost as good as the C versions. The +r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD +versions. + +.SH DIRECT ACCESS TO SPECIFIC FUNCTION VERSIONS +Calling the functions listed above automatically calls the appropriate +version of the function depending on the CPU type and available SIMD +instructions. A particular version can also be called directly by +appending the appropriate suffix to the function name. The available +suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX, +SSE, SSE2, Altivec and portable versions, respectively. For example, +the SSE2 version of the update_viterbi27_blk() function can be invoked +as update_viterbi27_blk_sse2(). + +Naturally, the _av functions are only available on the PowerPC and the +_mmx, _sse and _sse2 versions are only available on IA-32. Calling +a SIMD-enabled function on a CPU that doesn't support the appropriate +set of instructions will result in an illegal instruction exception. + +.SH RETURN VALUES +\fBcreate_viterbi\fR returns a pointer to the structure containing +the decoder state. +The other functions return -1 on error, 0 otherwise. + +.SH AUTHOR & COPYRIGHT +Phil Karn, KA9Q (karn@ka9q.net) + +.SH LICENSE +This software may be used under the terms of the GNU Lesser General Public License (LGPL). 
+ + diff --git a/libfec/sqtest.c b/libfec/sqtest.c new file mode 100644 index 0000000..b2abb09 --- /dev/null +++ b/libfec/sqtest.c @@ -0,0 +1,42 @@ +/* Verify correctness of the sum-of-square routines */ +#include +#include +#include + +/* These values should trigger leading/trailing array fragment handling */ +#define NSAMP 200002 +#define OFFSET 1 + +long long sumsq_wq(signed short *in,int cnt); +long long sumsq_wq_ref(signed short *in,int cnt); + +int main(){ + int i; + long long result,rresult; + signed short samples[NSAMP]; + + srandom(time(NULL)); + + for(i=0;i old metrics + movq NEWMETRICS(%rdx),%rdi # edi -> new metrics + movq DP(%rdx),%rdx # edx -> decisions + +1: movq 16(%rbp),%rax # eax = nbits + decq %rax + jl 2f # passed zero, we're done + movq %rax,16(%rbp) + + xorq %rax,%rax + movq 12(%rbp),%rbx # ebx = syms + movb (%rbx),%al + movd %rax,%xmm6 # xmm6[0] = first symbol + movb 1(%rbx),%al + movd %rax,%xmm5 # xmm5[0] = second symbol + addq $2,%rbx + movq %rbx,12(%rbp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + pshuflw $0,%xmm6,%xmm6 # copy low word to low 3 + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones(%rip),%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa (Branchtab27_sse2+(16*\GROUP))(%rip),%xmm4 + movdqa (Branchtab27_sse2+32+(16*\GROUP))(%rip),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + + # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics + # This is okay for this + # code because the worst-case metric spread (at high Eb/No) is only 120, + # well within the range of our unsigned 8-bit path metrics, and even within + # the range of signed 8-bit path metrics + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, 
high bit = 0 + movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 # note use of saturating arithmetic + paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not? + + # negate branch metrics + pxor %xmm7,%xmm4 + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in mm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in mm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new branch metrics in mm0,2 + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%rdi) + movdqa %xmm4,(32*\GROUP)(%rdi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shlq $16,%rbx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orq %rax,%rbx + movq %rbx,(4*\GROUP)(%rdx) + .endm + + # invoke macro 2 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + + addq $8,%rdx # bump decision pointer + + # See if we have to normalize. This requires an explanation. We don't want + # our path metrics to exceed 255 on the *next* iteration. Since the + # largest branch metric is 30, that means we don't want any to exceed 225 + # on *this* iteration. Rather than look them all, we just pick an arbitrary one + # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally- + # determined worst-case metric spread for this code and branch metrics in the range 0-30. 
+ + # This is extremely conservative, and empirical testing at a variety of Eb/Nos might + # show that a higher threshold could be used without affecting BER performance + movq (%rdi),%rax # extract first output metric + andq $255,%rax + cmp $105,%rax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics. We can't just pick an arbitrary small constant because + # the minimum metric might be zero! + movdqa (%rdi),%xmm0 + movdqa %xmm0,%xmm4 + movdqa 16(%rdi),%xmm1 + pminub %xmm1,%xmm4 + movdqa 32(%rdi),%xmm2 + pminub %xmm2,%xmm4 + movdqa 48(%rdi),%xmm3 + pminub %xmm3,%xmm4 + + # crunch down to single lowest metric + movdqa %xmm4,%xmm5 + psrldq $8,%xmm5 # the count to psrldq is bytes, not bits! + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $32,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $16,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $8,%xmm5 + pminub %xmm5,%xmm4 # now in lowest byte of %xmm4 + + punpcklbw %xmm4,%xmm4 # lowest 2 bytes + pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes + punpcklqdq %xmm4,%xmm4 # all 16 bytes + + # xmm4 now contains lowest metric in all 16 bytes + # subtract it from every output metric + psubusb %xmm4,%xmm0 + psubusb %xmm4,%xmm1 + psubusb %xmm4,%xmm2 + psubusb %xmm4,%xmm3 + movdqa %xmm0,(%rdi) + movdqa %xmm1,16(%rdi) + movdqa %xmm2,32(%rdi) + movdqa %xmm3,48(%rdi) + +done: + # swap metrics + movq %rsi,%rax + movq %rdi,%rsi + movq %rax,%rdi + jmp 1b + +2: movq 8(%rbp),%rbx # ebx = vp + # stash metric pointers + movq %rsi,OLDMETRICS(%rbx) + movq %rdi,NEWMETRICS(%rbx) + movq %rdx,DP(%rbx) # stash incremented value of vp->dp + xorq %rax,%rax +err: popq %rbx + popq %rdx + popq %rdi + popq %rsi + popq %rbp + ret + + .data + .align 16 + +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 diff --git a/libfec/sse2bfly27.s b/libfec/sse2bfly27.s new file mode 100644 index 0000000..27422a2 --- /dev/null +++ b/libfec/sse2bfly27.s @@ -0,0 +1,202 @@ 
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2003 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ; returns 0, or -1 if vp is NULL +*/ + # SSE2 (128-bit integer SIMD) version + # Requires Pentium 4 or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 + .text + .global update_viterbi27_blk_sse2,Branchtab27_sse2 + .type update_viterbi27_blk_sse2,@function + .align 16 + +update_viterbi27_blk_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; a bare -1 would be a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%xmm6 # xmm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%xmm5 # xmm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + pshuflw $0,%xmm6,%xmm6 # copy low word to the other 3 words of the low quadword + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones,%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab27_sse2+(16*\GROUP),%xmm4 + movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + + # compute 5-bit branch metric in xmm4 by adding the individual symbol metrics + # This is okay for this + # code because the worst-case metric spread 
(at high Eb/No) is only 120, + # well within the range of our unsigned 8-bit path metrics, and even within + # the range of signed 8-bit path metrics + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+32)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 # note use of saturating arithmetic + paddusb %xmm4,%xmm3 # this shouldn't be necessary, but why not? + + # negate branch metrics + pxor %xmm7,%xmm4 + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in xmm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in xmm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new path metrics + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%edi) + movdqa %xmm4,(32*\GROUP)(%edi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shll $16,%ebx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orl %eax,%ebx + movl %ebx,(4*\GROUP)(%edx) + .endm + + # invoke macro 2 times for a total of 32 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + + addl $8,%edx # bump decision pointer + + # See if we have to normalize. This requires an explanation. We don't want + # our path metrics to exceed 255 on the *next* iteration. Since the + # largest branch metric is 30, that means we don't want any to exceed 225 + # on *this* iteration. Rather than look at them all, we just pick an arbitrary one + # (the first) and see if it exceeds 225-120=105, where 120 is the experimentally- + # determined worst-case metric spread for this code and branch metrics in the range 0-30. 
+ + # This is extremely conservative, and empirical testing at a variety of Eb/Nos might + # show that a higher threshold could be used without affecting BER performance + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $105,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics. We can't just pick an arbitrary small constant because + # the minimum metric might be zero! + movdqa (%edi),%xmm0 + movdqa %xmm0,%xmm4 + movdqa 16(%edi),%xmm1 + pminub %xmm1,%xmm4 + movdqa 32(%edi),%xmm2 + pminub %xmm2,%xmm4 + movdqa 48(%edi),%xmm3 + pminub %xmm3,%xmm4 + + # crunch down to single lowest metric + movdqa %xmm4,%xmm5 + psrldq $8,%xmm5 # the count to psrldq is bytes, not bits! + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $32,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $16,%xmm5 + pminub %xmm5,%xmm4 + movdqa %xmm4,%xmm5 + psrlq $8,%xmm5 + pminub %xmm5,%xmm4 # now in lowest byte of %xmm4 + + punpcklbw %xmm4,%xmm4 # lowest 2 bytes + pshuflw $0,%xmm4,%xmm4 # lowest 8 bytes + punpcklqdq %xmm4,%xmm4 # all 16 bytes + + # xmm4 now contains lowest metric in all 16 bytes + # subtract it from every output metric + psubusb %xmm4,%xmm0 + psubusb %xmm4,%xmm1 + psubusb %xmm4,%xmm2 + psubusb %xmm4,%xmm3 + movdqa %xmm0,(%edi) + movdqa %xmm1,16(%edi) + movdqa %xmm2,32(%edi) + movdqa %xmm3,48(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax # success: return 0 +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 16 + +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 diff --git a/libfec/sse2bfly29-64.s b/libfec/sse2bfly29-64.s new file mode 100644 index 0000000..22bd8a1 --- /dev/null +++ b/libfec/sse2bfly29-64.s @@ -0,0 
+1,254 @@ +/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + Modifications for x86_64, 2012 Matthias P. Braendli, HB9EGM + - changed registers to x86-64 equivalents + - changed instructions accordingly + - %rip indirect addressing needed for position independent code, + which is required because x86-64 needs dynamic libs to be PIC. + That still doesn't work + + int update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; returns 0, or -1 if vp is NULL +*/ + # SSE2 (128-bit integer SIMD) version + # All X86-64 CPUs include SSE2 + + # These are offsets into struct v29, defined in viterbi29_av.c + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + + .text + .global update_viterbi29_blk_sse2,Branchtab29_sse2 + .type update_viterbi29_blk_sse2,@function + .align 16 + +update_viterbi29_blk_sse2: + pushq %rbp + movq %rsp,%rbp + /* convention different between i386 and x86_64: rsi and rdi belong to called function, not caller */ + /* Let's say we don't care (yet) */ + pushq %rsi + pushq %rdi + pushq %rdx + pushq %rbx + + movq 8(%rbp),%rdx # rdx = vp; NOTE(review): i386-style stack argument fetch - the SysV x86-64 ABI passes vp/syms/nbits in registers; confirm before enabling this file + testq %rdx,%rdx + jnz 0f + movq $-1,%rax # vp == NULL: return -1 (immediate; a bare -1 would be a load from address -1) + jmp err +0: movq OLDMETRICS(%rdx),%rsi # rsi -> old metrics + movq NEWMETRICS(%rdx),%rdi # rdi -> new metrics + movq DP(%rdx),%rdx # rdx -> decisions + +1: movq 16(%rbp),%rax # rax = nbits + decq %rax + jl 2f # passed zero, we're done + movq %rax,16(%rbp) + + xorq %rax,%rax + movq 12(%rbp),%rbx # rbx = syms + movb (%rbx),%al + movd %rax,%xmm6 # xmm6[0] = first symbol + movb 1(%rbx),%al + movd %rax,%xmm5 # xmm5[0] = second symbol + addq $2,%rbx + movq %rbx,12(%rbp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + movdqa thirtyones(%rip),%xmm7 + pshuflw $0,%xmm6,%xmm6 # copy low word to the other 3 words of the low quadword + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 
now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones(%rip),%xmm7 + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab29_sse2+(16*\GROUP)(%rip),%xmm4 + movdqa Branchtab29_sse2+128+(16*\GROUP)(%rip),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 # xmm4 contains branch metrics + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 + paddusb %xmm4,%xmm3 + + # invert branch metrics + pxor %xmm7,%xmm4 + + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in xmm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in xmm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new path metrics + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%rdi) + movdqa %xmm4,(32*\GROUP)(%rdi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shlq $16,%rbx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orq %rax,%rbx + movq %rbx,(4*\GROUP)(%rdx) + .endm + + # invoke macro 8 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + + addq $32,%rdx # bump decision pointer + + # see if we have to normalize + movq (%rdi),%rax # extract first output metric + andq $255,%rax + cmp $50,%rax # is it greater than 50? 
+ movq $0,%rax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movdqa (%rdi),%xmm0 + pminub 16(%rdi),%xmm0 + pminub 32(%rdi),%xmm0 + pminub 48(%rdi),%xmm0 + pminub 64(%rdi),%xmm0 + pminub 80(%rdi),%xmm0 + pminub 96(%rdi),%xmm0 + pminub 112(%rdi),%xmm0 + pminub 128(%rdi),%xmm0 + pminub 144(%rdi),%xmm0 + pminub 160(%rdi),%xmm0 + pminub 176(%rdi),%xmm0 + pminub 192(%rdi),%xmm0 + pminub 208(%rdi),%xmm0 + pminub 224(%rdi),%xmm0 + pminub 240(%rdi),%xmm0 + + # crunch down to single lowest metric + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 # the count to psrldq is bytes, not bits! + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $32,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $16,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $8,%xmm0 + pminub %xmm1,%xmm0 + + punpcklbw %xmm0,%xmm0 # lowest 2 bytes + pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes + punpcklqdq %xmm0,%xmm0 # all 16 bytes + + # xmm0 now contains lowest metric in all 16 bytes + # subtract it from every output metric + movdqa (%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,(%rdi) + movdqa 16(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,16(%rdi) + movdqa 32(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,32(%rdi) + movdqa 48(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,48(%rdi) + movdqa 64(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,64(%rdi) + movdqa 80(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,80(%rdi) + movdqa 96(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,96(%rdi) + movdqa 112(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,112(%rdi) + movdqa 128(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,128(%rdi) + movdqa 144(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,144(%rdi) + movdqa 160(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,160(%rdi) + movdqa 176(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,176(%rdi) + movdqa 192(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,192(%rdi) + movdqa 
208(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,208(%rdi) + movdqa 224(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,224(%rdi) + movdqa 240(%rdi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,240(%rdi) + +done: + # swap metrics + movq %rsi,%rax + movq %rdi,%rsi + movq %rax,%rdi + jmp 1b + +2: movq 8(%rbp),%rbx # rbx = vp + # stash metric pointers + movq %rsi,OLDMETRICS(%rbx) + movq %rdi,NEWMETRICS(%rbx) + movq %rdx,DP(%rbx) # stash incremented value of vp->dp + xorq %rax,%rax # success: return 0 +err: popq %rbx + popq %rdx + popq %rdi + popq %rsi + popq %rbp + ret + + .data + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + diff --git a/libfec/sse2bfly29.s b/libfec/sse2bfly29.s new file mode 100644 index 0000000..0fa1742 --- /dev/null +++ b/libfec/sse2bfly29.s @@ -0,0 +1,245 @@ +/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; returns 0, or -1 if vp is NULL +*/ + + # SSE2 (128-bit integer SIMD) version + # Requires Pentium 4 or better + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + + .text + .global update_viterbi29_blk_sse2,Branchtab29_sse2 + .type update_viterbi29_blk_sse2,@function + .align 16 + +update_viterbi29_blk_sse2: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; a bare -1 would be a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%xmm6 # xmm6[0] = first symbol + movb 
1(%ebx),%al + movd %eax,%xmm5 # xmm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %xmm6,%xmm6 # xmm6[1] = xmm6[0] + punpcklbw %xmm5,%xmm5 + movdqa thirtyones,%xmm7 + pshuflw $0,%xmm6,%xmm6 # copy low word to the other 3 words of the low quadword + pshuflw $0,%xmm5,%xmm5 + punpcklqdq %xmm6,%xmm6 # propagate to all 16 + punpcklqdq %xmm5,%xmm5 + # xmm6 now contains first symbol in each byte, xmm5 the second + + movdqa thirtyones,%xmm7 # redundant reload: xmm7 was already loaded a few instructions above + + # each invocation of this macro does 16 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movdqa Branchtab29_sse2+(16*\GROUP),%xmm4 + movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3 + pxor %xmm6,%xmm4 + pxor %xmm5,%xmm3 + pavgb %xmm3,%xmm4 + psrlw $3,%xmm4 + + pand %xmm7,%xmm4 # xmm4 contains branch metrics + + movdqa (16*\GROUP)(%esi),%xmm0 # Incoming path metric, high bit = 0 + movdqa ((16*\GROUP)+128)(%esi),%xmm3 # Incoming path metric, high bit = 1 + movdqa %xmm0,%xmm2 + movdqa %xmm3,%xmm1 + paddusb %xmm4,%xmm0 + paddusb %xmm4,%xmm3 + + # invert branch metrics + pxor %xmm7,%xmm4 + + paddusb %xmm4,%xmm1 + paddusb %xmm4,%xmm2 + + # Find survivors, leave in xmm0,2 + pminub %xmm1,%xmm0 + pminub %xmm3,%xmm2 + # get decisions, leave in xmm1,3 + pcmpeqb %xmm0,%xmm1 + pcmpeqb %xmm2,%xmm3 + + # interleave and store new path metrics + movdqa %xmm0,%xmm4 + punpckhbw %xmm2,%xmm0 # interleave second 16 new metrics + punpcklbw %xmm2,%xmm4 # interleave first 16 new metrics + movdqa %xmm0,(32*\GROUP+16)(%edi) + movdqa %xmm4,(32*\GROUP)(%edi) + + # interleave decisions & store + movdqa %xmm1,%xmm4 + punpckhbw %xmm3,%xmm1 + punpcklbw %xmm3,%xmm4 + # work around bug in gas due to Intel doc error + .byte 0x66,0x0f,0xd7,0xd9 # pmovmskb %xmm1,%ebx + shll $16,%ebx + .byte 0x66,0x0f,0xd7,0xc4 # pmovmskb %xmm4,%eax + orl %eax,%ebx + movl %ebx,(4*\GROUP)(%edx) + .endm + + # invoke macro 8 times for a total of 128 butterflies + butterfly GROUP=0 + butterfly GROUP=1 + butterfly GROUP=2 + butterfly GROUP=3 + butterfly GROUP=4 + 
butterfly GROUP=5 + butterfly GROUP=6 + butterfly GROUP=7 + + addl $32,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax # mov leaves the flags from cmp intact for the jle below + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movdqa (%edi),%xmm0 + pminub 16(%edi),%xmm0 + pminub 32(%edi),%xmm0 + pminub 48(%edi),%xmm0 + pminub 64(%edi),%xmm0 + pminub 80(%edi),%xmm0 + pminub 96(%edi),%xmm0 + pminub 112(%edi),%xmm0 + pminub 128(%edi),%xmm0 + pminub 144(%edi),%xmm0 + pminub 160(%edi),%xmm0 + pminub 176(%edi),%xmm0 + pminub 192(%edi),%xmm0 + pminub 208(%edi),%xmm0 + pminub 224(%edi),%xmm0 + pminub 240(%edi),%xmm0 + + # crunch down to single lowest metric + movdqa %xmm0,%xmm1 + psrldq $8,%xmm0 # the count to psrldq is bytes, not bits! + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $32,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $16,%xmm0 + pminub %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + psrlq $8,%xmm0 + pminub %xmm1,%xmm0 + + punpcklbw %xmm0,%xmm0 # lowest 2 bytes + pshuflw $0,%xmm0,%xmm0 # lowest 8 bytes + punpcklqdq %xmm0,%xmm0 # all 16 bytes + + # xmm0 now contains lowest metric in all 16 bytes + # subtract it from every output metric + movdqa (%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,(%edi) + movdqa 16(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,16(%edi) + movdqa 32(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,32(%edi) + movdqa 48(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,48(%edi) + movdqa 64(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,64(%edi) + movdqa 80(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,80(%edi) + movdqa 96(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,96(%edi) + movdqa 112(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,112(%edi) + movdqa 128(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,128(%edi) + movdqa 144(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa 
%xmm1,144(%edi) + movdqa 160(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,160(%edi) + movdqa 176(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,176(%edi) + movdqa 192(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,192(%edi) + movdqa 208(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,208(%edi) + movdqa 224(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,224(%edi) + movdqa 240(%edi),%xmm1 + psubusb %xmm0,%xmm1 + movdqa %xmm1,240(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31 + diff --git a/libfec/ssebfly27.s b/libfec/ssebfly27.s new file mode 100644 index 0000000..7f445da --- /dev/null +++ b/libfec/ssebfly27.s @@ -0,0 +1,205 @@ +/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies + for 64-state (k=7) convolutional code + Copyright 2001 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; returns 0, or -1 if vp is NULL +*/ + + # SSE (64-bit integer SIMD) version + # Requires Pentium III or better + + # These are offsets into struct v27, defined in viterbi27.h + .set DP,128 + .set OLDMETRICS,132 + .set NEWMETRICS,136 +.text +.global update_viterbi27_blk_sse,Branchtab27_sse + .type update_viterbi27_blk_sse,@function + .align 16 + +update_viterbi27_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 (immediate; a bare -1 would be a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> 
decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # %ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + movq thirtyones,%mm7 + + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab27_sse+(8*\GROUP),%mm4 + movq Branchtab27_sse+32+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+32)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics. 
This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new path metrics + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 4 times for a total of 32 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + + addl $8,%edx # bump decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmpl $150,%eax # is it greater than 150? 
+ movl $0,%eax # mov leaves the flags from cmpl intact for the jle below + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + + movd %mm0,%eax # eax is overwritten at 'done' below, so this value is unused + and $0xff,%eax + +done: # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax # success: return 0 +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + + ret + + .data + + .align 16 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + + diff --git a/libfec/ssebfly29.s b/libfec/ssebfly29.s new file mode 100644 index 0000000..d7d2149 --- /dev/null +++ b/libfec/ssebfly29.s @@ -0,0 +1,271 @@ +/* Intel SIMD SSE implementation of Viterbi ACS butterflies + for 256-state (k=9) convolutional code + Copyright 2004 Phil Karn, KA9Q + This code may be used under the terms of the GNU Lesser General Public License (LGPL) + + void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits); +*/ + # SSE (64-bit integer SIMD) version + # Requires 
Pentium III or better + # These are offsets into struct v29, defined in viterbi29.h + .set DP,512 + .set OLDMETRICS,516 + .set NEWMETRICS,520 + .text + .global update_viterbi29_blk_sse,Branchtab29_sse + .type update_viterbi29_blk_sse,@function + .align 16 + +update_viterbi29_blk_sse: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %edx + pushl %ebx + + movl 8(%ebp),%edx # edx = vp + testl %edx,%edx + jnz 0f + movl $-1,%eax # vp == NULL: return -1 ('$' = immediate; a bare -1 would be a load from address -1) + jmp err +0: movl OLDMETRICS(%edx),%esi # esi -> old metrics + movl NEWMETRICS(%edx),%edi # edi -> new metrics + movl DP(%edx),%edx # edx -> decisions + +1: movl 16(%ebp),%eax # eax = nbits + decl %eax + jl 2f # passed zero, we're done + movl %eax,16(%ebp) + + xorl %eax,%eax + movl 12(%ebp),%ebx # ebx = syms + movb (%ebx),%al + movd %eax,%mm6 # mm6[0] = first symbol + movb 1(%ebx),%al + movd %eax,%mm5 # mm5[0] = second symbol + addl $2,%ebx + movl %ebx,12(%ebp) + + punpcklbw %mm6,%mm6 # mm6[1] = mm6[0] + punpcklbw %mm5,%mm5 + + movq thirtyones,%mm7 + pshufw $0,%mm6,%mm6 # copy low word to upper 3 + pshufw $0,%mm5,%mm5 + # mm6 now contains first symbol in each byte, mm5 the second + + # each invocation of this macro does 8 butterflies in parallel + .MACRO butterfly GROUP + # compute branch metrics + movq Branchtab29_sse+(8*\GROUP),%mm4 + movq Branchtab29_sse+128+(8*\GROUP),%mm3 + pxor %mm6,%mm4 + pxor %mm5,%mm3 + pavgb %mm3,%mm4 # mm4 contains branch metrics + psrlw $3,%mm4 + pand %mm7,%mm4 + + movq (8*\GROUP)(%esi),%mm0 # Incoming path metric, high bit = 0 + movq ((8*\GROUP)+128)(%esi),%mm3 # Incoming path metric, high bit = 1 + movq %mm0,%mm2 + movq %mm3,%mm1 + paddusb %mm4,%mm0 + paddusb %mm4,%mm3 + + # invert branch metrics.
This works only because they're 5 bits + pxor %mm7,%mm4 + + paddusb %mm4,%mm1 + paddusb %mm4,%mm2 + + # Find survivors, leave in mm0,2 + pminub %mm1,%mm0 + pminub %mm3,%mm2 + # get decisions, leave in mm1,3 + pcmpeqb %mm0,%mm1 + pcmpeqb %mm2,%mm3 + + # interleave and store new branch metrics in mm0,2 + movq %mm0,%mm4 + punpckhbw %mm2,%mm0 # interleave second 8 new metrics + punpcklbw %mm2,%mm4 # interleave first 8 new metrics + movq %mm0,(16*\GROUP+8)(%edi) + movq %mm4,(16*\GROUP)(%edi) + + # interleave decisions, accumulate into %ebx + movq %mm1,%mm4 + punpckhbw %mm3,%mm1 + punpcklbw %mm3,%mm4 + # Due to an error in the Intel instruction set ref (the register + # fields are swapped), gas assembles pmovmskb incorrectly + # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html + .byte 0x0f,0xd7,0xc1 # pmovmskb %mm1,%eax + shll $((16*\GROUP+8)&31),%eax + orl %eax,%ebx + .byte 0x0f,0xd7,0xc4 # pmovmskb %mm4,%eax + shll $((16*\GROUP)&31),%eax + orl %eax,%ebx + .endm + + # invoke macro 16 times for a total of 128 butterflies + xorl %ebx,%ebx # clear decisions + butterfly GROUP=0 + butterfly GROUP=1 + movl %ebx,(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=2 + butterfly GROUP=3 + movl %ebx,4(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=4 + butterfly GROUP=5 + movl %ebx,8(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=6 + butterfly GROUP=7 + movl %ebx,12(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=8 + butterfly GROUP=9 + movl %ebx,16(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=10 + butterfly GROUP=11 + movl %ebx,20(%edx) # stash second 32 decisions + xorl %ebx,%ebx # clear decisions + butterfly GROUP=12 + butterfly GROUP=13 + movl %ebx,24(%edx) # stash first 32 decisions + xorl %ebx,%ebx + butterfly GROUP=14 + butterfly GROUP=15 + movl %ebx,28(%edx) # stash second 32 decisions + + addl $32,%edx # bump 
decision pointer + + # see if we have to normalize + movl (%edi),%eax # extract first output metric + andl $255,%eax + cmp $50,%eax # is it greater than 50? + movl $0,%eax + jle done # No, no need to normalize + + # Normalize by finding smallest metric and subtracting it + # from all metrics + movq (%edi),%mm0 + pminub 8(%edi),%mm0 + pminub 16(%edi),%mm0 + pminub 24(%edi),%mm0 + pminub 32(%edi),%mm0 + pminub 40(%edi),%mm0 + pminub 48(%edi),%mm0 + pminub 56(%edi),%mm0 + pminub 64(%edi),%mm0 + pminub 72(%edi),%mm0 + pminub 80(%edi),%mm0 + pminub 88(%edi),%mm0 + pminub 96(%edi),%mm0 + pminub 104(%edi),%mm0 + pminub 112(%edi),%mm0 + pminub 120(%edi),%mm0 + pminub 128(%edi),%mm0 + pminub 136(%edi),%mm0 + pminub 144(%edi),%mm0 + pminub 152(%edi),%mm0 + pminub 160(%edi),%mm0 + pminub 168(%edi),%mm0 + pminub 176(%edi),%mm0 + pminub 184(%edi),%mm0 + pminub 192(%edi),%mm0 + pminub 200(%edi),%mm0 + pminub 208(%edi),%mm0 + pminub 216(%edi),%mm0 + pminub 224(%edi),%mm0 + pminub 232(%edi),%mm0 + pminub 240(%edi),%mm0 + pminub 248(%edi),%mm0 + # mm0 contains 8 smallest metrics + # crunch down to single lowest metric + movq %mm0,%mm1 + psrlq $32,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $16,%mm0 + pminub %mm1,%mm0 + movq %mm0,%mm1 + psrlq $8,%mm0 + pminub %mm1,%mm0 + movq 8(%edi),%mm1 # reload + punpcklbw %mm0,%mm0 # expand to all 8 bytes + pshufw $0,%mm0,%mm0 + + # mm0 now contains lowest metric in all 8 bytes + # subtract it from every output metric + # Trashes %mm7 + .macro PSUBUSBM REG,MEM + movq \MEM,%mm7 + psubusb \REG,%mm7 + movq %mm7,\MEM + .endm + + PSUBUSBM %mm0,(%edi) + PSUBUSBM %mm0,8(%edi) + PSUBUSBM %mm0,16(%edi) + PSUBUSBM %mm0,24(%edi) + PSUBUSBM %mm0,32(%edi) + PSUBUSBM %mm0,40(%edi) + PSUBUSBM %mm0,48(%edi) + PSUBUSBM %mm0,56(%edi) + PSUBUSBM %mm0,64(%edi) + PSUBUSBM %mm0,72(%edi) + PSUBUSBM %mm0,80(%edi) + PSUBUSBM %mm0,88(%edi) + PSUBUSBM %mm0,96(%edi) + PSUBUSBM %mm0,104(%edi) + PSUBUSBM %mm0,112(%edi) + PSUBUSBM %mm0,120(%edi) + PSUBUSBM %mm0,128(%edi) 
+ PSUBUSBM %mm0,136(%edi) + PSUBUSBM %mm0,144(%edi) + PSUBUSBM %mm0,152(%edi) + PSUBUSBM %mm0,160(%edi) + PSUBUSBM %mm0,168(%edi) + PSUBUSBM %mm0,176(%edi) + PSUBUSBM %mm0,184(%edi) + PSUBUSBM %mm0,192(%edi) + PSUBUSBM %mm0,200(%edi) + PSUBUSBM %mm0,208(%edi) + PSUBUSBM %mm0,216(%edi) + PSUBUSBM %mm0,224(%edi) + PSUBUSBM %mm0,232(%edi) + PSUBUSBM %mm0,240(%edi) + PSUBUSBM %mm0,248(%edi) + +done: + # swap metrics + movl %esi,%eax + movl %edi,%esi + movl %eax,%edi + jmp 1b + +2: emms + movl 8(%ebp),%ebx # ebx = vp + # stash metric pointers + movl %esi,OLDMETRICS(%ebx) + movl %edi,NEWMETRICS(%ebx) + movl %edx,DP(%ebx) # stash incremented value of vp->dp + xorl %eax,%eax +err: popl %ebx + popl %edx + popl %edi + popl %esi + popl %ebp + ret + + .data + .align 8 +thirtyones: + .byte 31,31,31,31,31,31,31,31 + + diff --git a/libfec/sumsq.c b/libfec/sumsq.c new file mode 100644 index 0000000..e567c89 --- /dev/null +++ b/libfec/sumsq.c @@ -0,0 +1,50 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include +#include "fec.h" + +unsigned long long sumsq_port(signed short *,int); + +#ifdef __i386__ +unsigned long long sumsq_mmx(signed short *,int); +unsigned long long sumsq_sse(signed short *,int); +unsigned long long sumsq_sse2(signed short *,int); +#endif + +#ifdef __x86_64__ +unsigned long long sumsq_sse2(signed short *,int); +#endif + +#ifdef __VEC__ +unsigned long long sumsq_av(signed short *,int); +#endif + +unsigned long long sumsq(signed short *in,int cnt){ + switch(Cpu_mode){ + case PORT: + default: + return sumsq_port(in,cnt); +#ifdef __i386__ + case SSE: + case MMX: + return sumsq_mmx(in,cnt); + case SSE2: + return sumsq_sse2(in,cnt); +#endif + +#ifdef __x86_64__ + case SSE2: + return sumsq_port(in,cnt); + //return sumsq_sse2(in,cnt); +#endif + +#ifdef __VEC__ + case ALTIVEC: + return sumsq_av(in,cnt); +#endif + } +} diff 
--git a/libfec/sumsq_av.c b/libfec/sumsq_av.c new file mode 100644 index 0000000..53c6acf --- /dev/null +++ b/libfec/sumsq_av.c @@ -0,0 +1,78 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * This is the Altivec SIMD version. It's a little hairy because Altivec + * does not do 64-bit operations directly, so we have to accumulate separate + * 32-bit sums and carries + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +#include "fec.h" + +unsigned long long sumsq_av(signed short *in,int cnt){ + long long sum; + vector signed short x; + vector unsigned int sums,carries,s1,s2; + int pad; + union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s; + + carries = sums = (vector unsigned int)(0); + if((pad = (int)in & 15)!=0){ + /* Load unaligned leading word */ + x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in)); + if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */ + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + } + sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + in += 8-pad/2; + cnt -= 8-pad/2; + } + /* Everything is now aligned, rip through most of the block */ + while(cnt >= 8){ + x = vec_ld(0,in); + /* A single vec_msum cannot overflow, but we have to sum it with + * the earlier terms separately to handle the carries + * The cast to unsigned is OK because squares are always positive + */ + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + in += 8; + cnt -= 8; + } + /* Handle trailing fragment, if any */ + if(cnt > 0){ + x = vec_ld(0,in); + s.c[15] = (8-cnt)<<4; + x = vec_sro(x,s.cv); + s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0)); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + } + /* Combine 4 sub-sums and carries */ + s.c[15] = 64; /* 
Shift right two 32-bit words */ + s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + s.c[15] = 32; /* Shift right one 32-bit word */ + s1 = vec_sro(sums,s.cv); + s2 = vec_sro(carries,s.cv); + carries = vec_add(carries,vec_addc(sums,s1)); + sums = vec_add(sums,s1); + carries = vec_add(carries,s2); + + /* Extract sum and carries from right-hand words and combine into result */ + s.iv = sums; + sum = s.w[3]; + + s.iv = carries; + sum += (long long)s.w[3] << 32; + + return sum; +} + diff --git a/libfec/sumsq_mmx.c b/libfec/sumsq_mmx.c new file mode 100644 index 0000000..e766831 --- /dev/null +++ b/libfec/sumsq_mmx.c @@ -0,0 +1,35 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * MMX-assisted version (also used on SSE) + + * The SSE2 and MMX assist routines both operate on multiples of + * 8 words; they differ only in their alignment requirements (8 bytes + * for MMX, 16 bytes for SSE2) + + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser Public License (LGPL) + */ + +long long sumsq_mmx_assist(signed short *,int); + +long long sumsq_mmx(signed short *in,int cnt){ + long long sum = 0; + + /* Handle stuff before the next 8-byte boundary */ + while(((int)in & 7) != 0 && cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + sum += sumsq_mmx_assist(in,cnt); + in += cnt & ~7; + cnt &= 7; + + /* Handle up to 7 words at end */ + while(cnt != 0){ + sum += (long)in[0] * in[0]; + in++; + cnt--; + } + return sum; +} diff --git a/libfec/sumsq_mmx_assist.s b/libfec/sumsq_mmx_assist.s new file mode 100644 index 0000000..b3bac66 --- /dev/null +++ b/libfec/sumsq_mmx_assist.s @@ -0,0 +1,83 @@ +# MMX assist routines for sumsq +# Copyright 2001 Phil Karn, KA9Q +# May be used under the terms of the GNU Public License (GPL) + + .text + +# Evaluate sum of squares of signed 16-bit input samples +# long long 
sumsq_mmx_assist(signed short *in,int cnt); + .global sumsq_mmx_assist + .type sumsq_mmx_assist,@function + .align 16 +sumsq_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %ecx + pushl %ebx + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + xor %eax,%eax + xor %edx,%edx + + # Since 4 * 32767**2 < 2**32, we can accumulate two at a time +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0^2+S1^2) (S2^2+S3^2) + movq 8(%esi),%mm6 # S4 S5 S6 S7 + pmaddwd %mm6,%mm6 # (S4^2+S5^2) (S6^2+S7^2) + paddd %mm6,%mm0 # (S0^2+S1^2+S4^2+S5^2)(S2^2+S3^2+S6^2+S7^2) + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + psrlq $32,%mm0 + movd %mm0,%ebx + addl %ebx,%eax + adcl $0,%edx + addl $16,%esi + jmp 1b + +2: emms + popl %ebx + popl %ecx + popl %esi + popl %ebp + ret + +# Evaluate sum of squares of signed 16-bit input samples +# long sumsq_wd_mmx_assist(signed short *in,int cnt); +# Quick version, only safe for small numbers of small input values... + .global sumsq_wd_mmx_assist + .type sumsq_wd_mmx_assist,@function + .align 16 +sumsq_wd_mmx_assist: + pushl %ebp + movl %esp,%ebp + pushl %esi + + movl 8(%ebp),%esi + movl 12(%ebp),%ecx + pxor %mm2,%mm2 # zero sum + +1: subl $8,%ecx + jl 2f + movq (%esi),%mm0 # S0 S1 S2 S3 + pmaddwd %mm0,%mm0 # (S0*S0+S1*S1) (S2*S2+S3*S3) + movq 8(%esi),%mm1 + pmaddwd %mm1,%mm1 + paddd %mm1,%mm2 + paddd %mm0,%mm2 # accumulate + + addl $16,%esi + jmp 1b + +2: movd %mm2,%eax # even sum + psrlq $32,%mm2 + movd %mm2,%edx # odd sum + addl %edx,%eax + emms + popl %esi + popl %ebp + ret diff --git a/libfec/sumsq_port.c b/libfec/sumsq_port.c new file mode 100644 index 0000000..6d0b4c1 --- /dev/null +++ b/libfec/sumsq_port.c @@ -0,0 +1,16 @@ +/* Compute the sum of the squares of a vector of signed shorts + + * Portable C version + * Copyright 2004 Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ + +unsigned long long sumsq_port(signed short *in,int cnt){ + long long sum = 
0; + int i; + + for(i=0;i +#include +#include +#include +#include "config.h" +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +int Verbose = 0; + +int main(int argc,char *argv[]){ + signed short *buf; + int i,d,trial,trials=10000; + int bufsize = 2048; + long long port_sum,simd_sum; + time_t t; + int timetrials=0; + + find_cpu_mode(); + time(&t); + srandom(t); + +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + bufsize = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'v': + Verbose++; + break; + case 'T': + timetrials++; + break; + } + } + + buf = (signed short *)calloc(bufsize,sizeof(signed short)); + if(timetrials){ + for(trial=0;trial +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi27_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi27_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi27_mmx(len); + case SSE: + return create_viterbi27_sse(len); + case SSE2: + return create_viterbi27_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi27_port(len); +#endif + } +} + +void set_viterbi27_polynomial(int polys[2]){ + switch(Cpu_mode){ + case PORT: + default: + 
set_viterbi27_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi27_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi27_polynomial_mmx(polys); + break; + case SSE: + set_viterbi27_polynomial_sse(polys); + break; + case SSE2: + set_viterbi27_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi27_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi27_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi27_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi27_mmx(p,starting_state); + case SSE: + return init_viterbi27_sse(p,starting_state); + case SSE2: + return init_viterbi27_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi27_port(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi27( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi27_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi27_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi27_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi27_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi27_sse2(p,data,nbits,endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi27_port(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi27_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi27_av(p); + 
break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi27_mmx(p); + break; + case SSE: + delete_viterbi27_sse(p); + break; + case SSE2: + delete_viterbi27_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi27_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ +int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){ + if(p == NULL) + return -1; + + switch(Cpu_mode){ + case PORT: + default: + update_viterbi27_blk_port(p,syms,nbits); + break; +#ifdef __VEC__ + case ALTIVEC: + update_viterbi27_blk_av(p,syms,nbits); + break; +#endif +#ifdef __i386__ + case MMX: + update_viterbi27_blk_mmx(p,syms,nbits); + break; + case SSE: + update_viterbi27_blk_sse(p,syms,nbits); + break; + case SSE2: + update_viterbi27_blk_sse2(p,syms,nbits); + break; +#endif +#ifdef __x86_64__ + case SSE2: + update_viterbi27_blk_port(p,syms,nbits); + break; +#endif + } + return 0; +} diff --git a/libfec/viterbi27_av.c b/libfec/viterbi27_av.c new file mode 100644 index 0000000..98d7344 --- /dev/null +++ b/libfec/viterbi27_av.c @@ -0,0 +1,210 @@ +/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t; +typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t; + +static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_av(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<4;i++) + vp->metrics1.v[i] = (vector unsigned char)(63); + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_av(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_av(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA,V27POLYB }; + set_viterbi27_polynomial_av(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_av(vp,0); + return vp; +} + +/* Viterbi chainback: walk the stored decisions backwards from endstate and write nbits decoded bits into data. Returns 0, or -1 if p is NULL */ +int chainback_viterbi27_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; /* assigned only after p has been checked */ + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; /* safe: p is non-NULL here */ + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_av(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* Process received symbols */ +int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + vector unsigned char survivor0,survivor1,sym0v,sym1v; + vector bool char decision0,decision1; + vector unsigned char metric,m_metric,m0,m1,m2,m3; + void *tmp; + + /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */ + sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); + + sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */
+ sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */ + syms += 2; + + /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */ + + /* Form first set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Form first set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[0],metric); + m3 = vec_adds(vp->old_metrics->v[2],metric); + m1 = vec_adds(vp->old_metrics->v[2],m_metric); + m2 = vec_adds(vp->old_metrics->v[0],m_metric); + + /* Form second set of 16 branch metrics */ + metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v)); + metric = vec_sr(metric,(vector unsigned char)(3)); + m_metric = vec_sub((vector unsigned char)(31),metric); + + /* Compare and select first set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Compute second set of path metrics */ + m0 = vec_adds(vp->old_metrics->v[1],metric); + m3 = vec_adds(vp->old_metrics->v[3],metric); + m1 = vec_adds(vp->old_metrics->v[3],m_metric); + m2 = vec_adds(vp->old_metrics->v[1],m_metric); + + /* Interleave and store first decisions and survivors */ + d->v[0] = vec_mergeh(decision0,decision1); + d->v[1] = vec_mergel(decision0,decision1); + vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1); + + /* Compare and select second set */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Interleave and store second set of decisions and survivors */ + d->v[2] = vec_mergeh(decision0,decision1); + d->v[3] = vec_mergel(decision0,decision1); + vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[3] = 
vec_mergel(survivor0,survivor1); + + /* renormalize if necessary */ + if(vp->new_metrics->c[0] >= 105){ + vector unsigned char scale0,scale1; + + /* Find smallest metric and splat */ + scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]); + scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]); + scale0 = vec_min(scale0,scale1); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,8)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,4)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,2)); + scale0 = vec_min(scale0,vec_sld(scale0,scale0,1)); + + /* Now subtract from all metrics */ + vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0); + vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0); + vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0); + vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + + return 0; +} + diff --git a/libfec/viterbi27_mmx.c b/libfec/viterbi27_mmx.c new file mode 100644 index 0000000..a6d5125 --- /dev/null +++ b/libfec/viterbi27_mmx.c @@ -0,0 +1,115 @@ +/* K=7 r=1/2 Viterbi decoder for MMX + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { char c[64]; __m64 v[8];} decision_t; +typedef union { unsigned char c[64]; __m64 v[8];} metric_t; + +unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); +unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in mmxbfly27.s! 
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_mmx(void *p,int starting_state){ + struct v27 *vp = (struct v27 *)p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi27_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + int symbol; + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab27_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab27_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_mmx(int len){ + struct v27 *vp; + int polys[2] = { V27POLYA, V27POLYB }; + + if(Init == 0){ + set_viterbi27_polynomial_mmx(polys); + } + if((vp = (struct v27 *)malloc(sizeof(struct v27))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27_mmx(vp,0); + return vp; +} + +/* Viterbi chainback: walk the stored decisions backwards from endstate and write nbits decoded bits into data. Returns 0, or -1 if p is NULL */ +int chainback_viterbi27_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v27 *vp = (struct v27 *)p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate = (endstate & 63) << 2; /* hold the 6-bit state in bits 7..2: the loop indexes c[endstate>>2] and shifts each decision in at bit 7 */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate>>2] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_mmx(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi27_port.c b/libfec/viterbi27_port.c new file mode 100644 index 0000000..7cac2b3 --- /dev/null +++ b/libfec/viterbi27_port.c @@ -0,0 +1,191 @@ +/* K=7 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + + +typedef union { unsigned int w[64]; } metric_t; +typedef union { unsigned long w[2];} decision_t; +static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp; /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions; /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Sets all 64 path metrics in metrics1 to 63, selects metrics1/metrics2 as
+ * old/new buffers, rewinds dp to decisions and zeroes the metric of state
+ * (starting_state & 63).  Returns -1 if p is NULL, otherwise 0.
+ */
+int init_viterbi27_port(void *p,int starting_state){
+  struct v27 *vp = p;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  for(i=0;i<64;i++)
+    vp->metrics1.w[i] = 63;
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->w[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+/* Fill Branchtab27[] for the generator polynomials in polys[].
+ * Entry [n].c[state] becomes 255 when (polys[n] < 0) XOR
+ * parity((2*state) & |polys[n]|) is nonzero, else 0.  Increments Init so
+ * create_viterbi27_port() does not reload the defaults.
+ */
+void set_viterbi27_polynomial_port(int polys[2]){
+  int state;
+
+  for(state=0;state < 32;state++){
+    Branchtab27[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0;
+    Branchtab27[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder.
+ * Loads the default polynomials (V27POLYA, V27POLYB) when Init is 0,
+ * allocates the state and (len+6) decision_t entries, and initializes the
+ * decoder for starting state 0.  Returns NULL if either malloc() fails.
+ */
+void *create_viterbi27_port(int len){
+  struct v27 *vp;
+
+  if(!Init){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_port(polys);
+  }
+  if((vp = malloc(sizeof(struct v27))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_port(vp,0);
+
+  return vp;
+}
+
+/* Viterbi chainback.
+ * Walks the stored decisions from entry nbits-1 down to 0 and writes the
+ * running shift register to data[bit/8].
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int chainback_viterbi27_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    /* The 6-bit state sits in bits 2..7 of endstate; it selects one of the
+     * 64 decision bits, which BFLY packs 32 per w[] element.
+     */
+    k = (d[nbits].w[(endstate>>2)/32] >> ((endstate>>2)%32)) & 1;
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Delete instance of a Viterbi decoder: frees the decision buffer and the
+ * state itself; a NULL p is ignored.
+ */
+void delete_viterbi27_port(void *p){
+  struct v27 *vp = p;
+
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+/* C-language butterfly.
+ * Expects vp, d, sym0 and sym1 in the expanding scope.  For butterfly i it
+ * reads old metrics i and i+32, writes new metrics 2*i and 2*i+1, and ORs
+ * the two decision bits into d->w[i/16] (so the word must be cleared first).
+ */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+  metric = (Branchtab27[0].c[i] ^ sym0) + (Branchtab27[1].c[i] ^ sym1);\
+  m0 = vp->old_metrics->w[i] + metric;\
+  m1 = vp->old_metrics->w[i+32] + (510 - metric);\
+  decision = (signed int)(m0-m1) > 0;\
+  vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+  d->w[i/16] |= decision << ((2*i)&31);\
+  m0 -= (metric+metric-510);\
+  m1 += (metric+metric-510);\
+  decision = (signed int)(m0-m1) > 0;\
+  vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+  d->w[i/16] |= decision << ((2*i+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ */
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){ /* returns -1 if p is NULL, else 0; consumes two bytes of syms per decoded bit */
+  struct v27 *vp = p;
+  void *tmp;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    unsigned char sym0,sym1;
+
+    d->w[0] = d->w[1] = 0; /* BFLY ORs decision bits in, so start from zero */
+    sym0 = *syms++;
+    sym1 = *syms++;
+
+    BFLY(0);
+    BFLY(1);
+    BFLY(2);
+    BFLY(3);
+    BFLY(4);
+    BFLY(5);
+    BFLY(6);
+    BFLY(7);
+    BFLY(8);
+    BFLY(9);
+    BFLY(10);
+    BFLY(11);
+    BFLY(12);
+    BFLY(13);
+    BFLY(14);
+    BFLY(15);
+    BFLY(16);
+    BFLY(17);
+    BFLY(18);
+    BFLY(19);
+    BFLY(20);
+    BFLY(21);
+    BFLY(22);
+    BFLY(23);
+    BFLY(24);
+    BFLY(25);
+    BFLY(26);
+    BFLY(27);
+    BFLY(28);
+    BFLY(29);
+    BFLY(30);
+    BFLY(31);
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+  return 0;
+}
diff --git a/libfec/viterbi27_sse.c b/libfec/viterbi27_sse.c
new file mode 100644
index 0000000..cd1f287
--- /dev/null
+++ b/libfec/viterbi27_sse.c
@@ -0,0 +1,113 @@
+/* K=7 r=1/2 Viterbi decoder for SSE
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include /* NOTE(review): the header names of these bare #include lines are missing (angle-bracket text appears to have been lost); restore before building */
+#include
+#include
+#include
+#include "fec.h"
+
+typedef union { unsigned char c[64]; } metric_t;
+typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t;
+union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in ssebfly27.s!
+ */ +struct v27 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi27_sse(int len){ + struct v27 *vp; + + if(!Init){ + int polys[2] = { V27POLYA, V27POLYB }; + + set_viterbi27_polynomial_sse(polys); + } + if((vp = malloc(sizeof(struct v27))) == NULL) + return NULL; + if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi27(vp,0); + return vp; +} + +void set_viterbi27_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 32;state++){ + Branchtab27_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab27_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi27_sse(void *p,int starting_state){ + struct v27 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<64;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi27_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v27 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 64; + endstate <<= 2; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 6; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi27_sse(void *p){ + struct v27 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi27_sse2.c b/libfec/viterbi27_sse2.c new file mode 100644 index 0000000..bc01710 --- /dev/null +++ b/libfec/viterbi27_sse2.c @@ -0,0 +1,180 @@ +/* K=7 r=1/2 Viterbi decoder for SSE2 + * Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; +typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; +union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly27.s! 
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp; /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions; /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Sets all 64 path metrics in metrics1 to 63, selects metrics1/metrics2 as
+ * old/new buffers, rewinds dp to decisions and zeroes the metric of state
+ * (starting_state & 63).  Returns -1 if p is NULL, otherwise 0.
+ */
+int init_viterbi27_sse2(void *p,int starting_state){
+  struct v27 *vp = p;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  for(i=0;i<64;i++)
+    vp->metrics1.c[i] = 63;
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+/* Fill Branchtab27_sse2[] for the generator polynomials in polys[].
+ * Entry [n].c[state] becomes 255 when (polys[n] < 0) XOR
+ * parity((2*state) & |polys[n]|) is nonzero, else 0.  Increments Init so
+ * create_viterbi27_sse2() does not reload the defaults.
+ */
+void set_viterbi27_polynomial_sse2(int polys[2]){
+  int state;
+
+  for(state=0;state < 32;state++){
+    Branchtab27_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0;
+    Branchtab27_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0;
+  }
+  Init++;
+}
+
+
+/* Create a new instance of a Viterbi decoder.
+ * Loads the default polynomials (V27POLYA, V27POLYB) when Init is 0,
+ * allocates the state with posix_memalign() (aligned to sizeof(__m128i))
+ * and (len+6) decision_t entries with malloc(), then initializes the decoder
+ * for starting state 0.  Returns NULL if either allocation fails.
+ */
+void *create_viterbi27_sse2(int len){
+  void *p;
+  struct v27 *vp;
+
+  if(!Init){
+    int polys[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v27)))
+    return NULL;
+  vp = (struct v27 *)p;
+
+  if((p = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi27_sse2(vp,0);
+
+  return vp;
+}
+
+/* Viterbi chainback.
+ * Walks the stored decisions from entry nbits-1 down to 0 and writes the
+ * running shift register to data[bit/8].
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int chainback_viterbi27_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1;
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Delete instance of a Viterbi decoder: frees the decision buffer and the
+ * state itself; a NULL p is ignored.
+ */
+void delete_viterbi27_sse2(void *p){
+  struct v27 *vp = p;
+
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+
+#if 0
+/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. */
+void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi8(syms[0]);
+    sym1v = _mm_set1_epi8(syms[1]);
+    syms += 2;
+
+    for(i=0;i<2;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v));
+      /* There's no packed bytes right shift in SSE2, so we use the word version and mask
+       * (I'm *really* starting to like Altivec...)
+       */
+      metric = _mm_srli_epi16(metric,3);
+      metric = _mm_and_si128(metric,_mm_set1_epi8(31));
+      m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric);
+
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_epi8(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric);
+      m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric);
+      m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select, using modulo arithmetic */
+      decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128());
+      decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128());
+      survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0));
+      survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2));
+
+      /* Pack each set of decisions into 16 bits */
+      d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1));
+      d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+}
+#endif
diff --git a/libfec/viterbi29.c b/libfec/viterbi29.c
new file mode 100644
index 0000000..f51e356
--- /dev/null
+++ b/libfec/viterbi29.c
@@ -0,0 +1,178 @@
+/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include /* NOTE(review): the header names of these bare #include lines are missing (angle-bracket text appears to have been lost); restore before building */
+#include
+#include
+#include "fec.h"
+
+/* Create a new instance of a Viterbi decoder.
+ * Calls find_cpu_mode() and then forwards to the create_viterbi29_* variant
+ * selected by Cpu_mode.  PORT and any unlisted mode use the portable
+ * version; on x86_64 the SSE2 mode also maps to the portable version.
+ */
+void *create_viterbi29(int len){
+  find_cpu_mode();
+
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return create_viterbi29_port(len);
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi29_av(len);
+#endif
+#ifdef __i386__
+  case MMX:
+    return create_viterbi29_mmx(len);
+  case SSE:
+    return create_viterbi29_sse(len);
+  case SSE2:
+    return create_viterbi29_sse2(len);
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    return create_viterbi29_port(len);
+#endif
+  }
+}
+/* Forward polys[] to the set_viterbi29_polynomial_* variant selected by
+ * Cpu_mode.  NOTE(review): unlike create_viterbi29(), this function (and the
+ * dispatchers below) does not call find_cpu_mode(); it switches on whatever
+ * Cpu_mode currently holds.
+ */
+void set_viterbi29_polynomial(int polys[2]){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi29_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi29_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi29_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi29_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi29_polynomial_sse2(polys);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    set_viterbi29_polynomial_port(polys);
+    break;
+#endif
+  }
+}
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Forwards to the init_viterbi29_* variant selected by Cpu_mode and returns
+ * its result.
+ */
+int init_viterbi29(void *p,int starting_state){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return init_viterbi29_port(p,starting_state);
+#ifdef __VEC__
+  case ALTIVEC:
+    return init_viterbi29_av(p,starting_state);
+#endif
+#ifdef __i386__
+  case MMX:
+    return init_viterbi29_mmx(p,starting_state);
+  case SSE:
+    return init_viterbi29_sse(p,starting_state);
+  case SSE2:
+    return init_viterbi29_sse2(p,starting_state);
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    return init_viterbi29_port(p,starting_state);
+#endif
+  }
+}
+
+/* Viterbi chainback.
+ * Forwards to the chainback_viterbi29_* variant selected by Cpu_mode and
+ * returns its result.
+ */
+int chainback_viterbi29(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return chainback_viterbi29_port(p,data,nbits,endstate);
+#ifdef __VEC__
+  case ALTIVEC:
+    return chainback_viterbi29_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+  case MMX:
+    return chainback_viterbi29_mmx(p,data,nbits,endstate);
+  case SSE:
+    return chainback_viterbi29_sse(p,data,nbits,endstate);
+  case SSE2:
+    return chainback_viterbi29_sse2(p,data,nbits,endstate);
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    return chainback_viterbi29_port(p,data,nbits,endstate);
+#endif
+  }
+}
+
+/* Delete instance of a Viterbi decoder via the delete_viterbi29_* variant
+ * selected by Cpu_mode.
+ */
+void delete_viterbi29(void *p){
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    delete_viterbi29_port(p);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    delete_viterbi29_av(p);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    delete_viterbi29_mmx(p);
+    break;
+  case SSE:
+    delete_viterbi29_sse(p);
+    break;
+  case SSE2:
+    delete_viterbi29_sse2(p);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    delete_viterbi29_port(p);
+    break;
+#endif
+  }
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols!
+ */
+int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){ /* forwards to the update_viterbi29_blk_* variant selected by Cpu_mode */
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    return update_viterbi29_blk_port(p,syms,nbits);
+#ifdef __VEC__
+  case ALTIVEC:
+    return update_viterbi29_blk_av(p,syms,nbits);
+#endif
+#ifdef __i386__
+  case MMX:
+    return update_viterbi29_blk_mmx(p,syms,nbits);
+  case SSE:
+    return update_viterbi29_blk_sse(p,syms,nbits);
+  case SSE2:
+    return update_viterbi29_blk_sse2(p,syms,nbits);
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    return update_viterbi29_blk_port(p,syms,nbits);
+#endif
+  }
+}
diff --git a/libfec/viterbi29_av.c b/libfec/viterbi29_av.c
new file mode 100644
index 0000000..31c8d27
--- /dev/null
+++ b/libfec/viterbi29_av.c
@@ -0,0 +1,190 @@
+/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include /* NOTE(review): the header names of these bare #include lines are missing (angle-bracket text appears to have been lost); restore before building */
+#include
+#include
+#include
+#include "fec.h"
+
+typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t;
+typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t;
+
+static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp; /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions; /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Fills all 16 vectors of metrics1 with 63, selects metrics1/metrics2 as
+ * old/new buffers, rewinds dp to decisions and zeroes the metric of state
+ * (starting_state & 255).  Returns -1 if p is NULL, otherwise 0.
+ */
+int init_viterbi29_av(void *p,int starting_state){
+  struct v29 *vp = p;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  for(i=0;i<16;i++)
+    vp->metrics1.v[i] = (vector unsigned char)(63);
+
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+/* Fill Branchtab29[] for the generator polynomials in polys[].
+ * Entry [n].c[state] (state 0..127) becomes 255 when (polys[n] < 0) XOR
+ * parity((2*state) & |polys[n]|) is nonzero, else 0.  Increments Init so
+ * create_viterbi29_av() does not reload the defaults.
+ */
+void set_viterbi29_polynomial_av(int polys[2]){
+  int state;
+
+  for(state=0;state < 128;state++){
+    Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0;
+    Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder.
+ * Loads the default polynomials (V29POLYA, V29POLYB) when Init is 0,
+ * allocates the state and (len+8) decision_t entries, and initializes the
+ * decoder for starting state 0.  Returns NULL if either malloc() fails.
+ */
+void *create_viterbi29_av(int len){
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_av(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_av(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback.
+ * Walks the stored decisions from entry nbits-1 down to 0 and writes the
+ * running shift register to data[bit/8].
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int chainback_viterbi29_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 256;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = d[nbits].c[endstate] & 1; /* this variant stores one decision byte per state */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+
+/* Delete instance of a Viterbi decoder: frees the decision buffer and the
+ * state itself; a NULL p is ignored.
+ */
+void delete_viterbi29_av(void *p){
+  struct v29 *vp = p;
+
+  if(vp != NULL){
+    free(vp->decisions);
+    free(vp);
+  }
+}
+
+/* Update the decoder with nbits decoded bits' worth of symbols (two bytes of
+ * syms per bit), storing one decision_t per bit at dp.
+ * Returns -1 if p is NULL, otherwise 0.
+ */
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+
+  while(nbits--){
+    vector unsigned char sym1v,sym2v;
+    void *tmp;
+
+    /* All this seems necessary just to load a byte into all elements of a vector! */
+    sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */
+    sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */
+    sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */
+    syms += 2;
+
+    for(i=0;i<8;i++){
+      vector bool char decision0,decision1;
+      vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v));
+      metric = vec_sr(metric,(vector unsigned char)(3));
+      m_metric = (vector unsigned char)(31) - metric;
+
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[8+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[8+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select first set */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+
+      /* Interleave and store decisions and survivors */
+      d->v[2*i] = vec_mergeh(decision0,decision1);
+      d->v[2*i+1] = vec_mergel(decision0,decision1);
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+    }
+    d++;
+    /* renormalize if necessary (triggered when the metric of state 0 reaches 50) */
+    if(vp->new_metrics->c[0] >= 50){
+      int i;
+      vector unsigned char scale0,scale1;
+
+      /* Find smallest metric and splat */
+      scale0 = vp->new_metrics->v[0];
+      scale1 = vp->new_metrics->v[1];
+      for(i=2;i<16;i+=2){
+        scale0 = vec_min(scale0,vp->new_metrics->v[i]);
+        scale1 = vec_min(scale1,vp->new_metrics->v[i+1]);
+      }
+      scale0 = vec_min(scale0,scale1);
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,8));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+
+      /* Now subtract from all metrics */
+      for(i=0;i<16;i++)
+        vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0);
+    }
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/libfec/viterbi29_mmx.c b/libfec/viterbi29_mmx.c
new file mode 100644
index 0000000..563f40a
--- /dev/null
+++ b/libfec/viterbi29_mmx.c
@@ -0,0 +1,118 @@
+/* K=9 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include
+#include
+#include
+#include
+#include "fec.h"
+
+typedef union { char c[256]; __m64 v[32];} decision_t;
+typedef union { unsigned char c[256]; __m64 v[32];} metric_t;
+
+unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8)));
+unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8)));
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly29.s!
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_mmx(int len){ + struct v29 *vp; + + if(Init == 0){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial_mmx(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_mmx(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + int symbol; + + for(symbol = 0;symbol < 256;symbol++){ + int sym; + + sym = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0); + Mettab29_1[symbol][state] = (sym ? (255-symbol):symbol) / 16; + + sym = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0); + Mettab29_2[symbol][state] = (sym ? 
(255-symbol):symbol) / 16; + } + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_mmx(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + struct v29 *vp = (struct v29 *)p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + endstate &= 255; + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_mmx(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi29_port.c b/libfec/viterbi29_port.c new file mode 100644 index 0000000..292dce8 --- /dev/null +++ b/libfec/viterbi29_port.c @@ -0,0 +1,166 @@ +/* K=9 r=1/2 Viterbi decoder in portable C + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab29[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to 
path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_port(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_port(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_port(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + set_viterbi29_polynomial_port(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_port(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (510 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-510);\ + m1 += (metric+metric-510);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */
+
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){ /* returns -1 if p is NULL, else 0; consumes two bytes of syms per decoded bit */
+  struct v29 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    void *tmp;
+    unsigned char sym0,sym1;
+    int i;
+
+    for(i=0;i<8;i++)
+      d->w[i] = 0; /* BFLY ORs decision bits in, so start from zero */
+    sym0 = *syms++;
+    sym1 = *syms++;
+
+    for(i=0;i<128;i++)
+      BFLY(i);
+
+    d++;
+    tmp = vp->old_metrics; /* swap old and new metric buffers for the next bit */
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+  return 0;
+}
diff --git a/libfec/viterbi29_sse.c b/libfec/viterbi29_sse.c
new file mode 100644
index 0000000..4a92e5f
--- /dev/null
+++ b/libfec/viterbi29_sse.c
@@ -0,0 +1,114 @@
+/* K=9 r=1/2 Viterbi decoder for SSE
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include /* NOTE(review): the header names of these bare #include lines are missing (angle-bracket text appears to have been lost); restore before building */
+#include
+#include
+#include
+#include "fec.h"
+
+typedef union { unsigned char w[256]; __m64 v[32];} metric_t;
+typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t;
+
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi29_sse(int len){ + struct v29 *vp; + + if(!Init){ + int polys[2] = { V29POLYA,V29POLYB }; + + set_viterbi29_polynomial_sse(polys); + } + if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL) + return NULL; + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi29(vp,0); + return vp; +} + +void set_viterbi29_polynomial_sse(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi29_sse(void *p,int starting_state){ + struct v29 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 200; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi29_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi29_sse2.c b/libfec/viterbi29_sse2.c new file mode 100644 index 0000000..4c7336c --- /dev/null +++ b/libfec/viterbi29_sse2.c @@ -0,0 +1,119 @@ +/* K=9 r=1/2 Viterbi decoder for SSE2 + * Copyright Feb 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[256]; __m128i v[16];} metric_t; +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; + +union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2]; +static int Init = 0; + +/* State info for instance of Viterbi decoder + * Don't change this without also changing references in sse2bfly29.s! 
+ */ +struct v29 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame; returns -1 if p is NULL, else 0 */ +int init_viterbi29_sse2(void *p,int starting_state){ + struct v29 *vp = p; + int i; + if(p == NULL) return -1; /* Same NULL contract as init_viterbi29_sse */ + for(i=0;i<256;i++) + vp->metrics1.c[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi29_polynomial_sse2(int polys[2]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab29_sse2[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab29_sse2[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + } + Init++; +} + + +/* Create a new instance of a Viterbi decoder with room for len+8 per-bit decisions; returns NULL if allocation fails */ +void *create_viterbi29_sse2(int len){ + void *p; + struct v29 *vp; + + if(!Init){ + int polys[2] = {V29POLYA,V29POLYB}; + + set_viterbi29_polynomial_sse2(polys); /* Only the _sse2 setter fills Branchtab29_sse2 and bumps this file's Init */ + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29))) + return NULL; + vp = (struct v29 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi29_sse2(vp,0); + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi29_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v29 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = vp->decisions; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ 
+ endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi29_sse2(void *p){ + struct v29 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} diff --git a/libfec/viterbi39.c b/libfec/viterbi39.c new file mode 100644 index 0000000..d2e65f4 --- /dev/null +++ b/libfec/viterbi39.c @@ -0,0 +1,179 @@ +/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Aug 2006, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39(int len){ + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi39_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi39_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi39_mmx(len); + case SSE: + return create_viterbi39_sse(len); + case SSE2: + return create_viterbi39_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi39_port(len); +#endif + } +} + +void set_viterbi39_polynomial(int polys[3]){ + switch(Cpu_mode){ + case PORT: + default: + set_viterbi39_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi39_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi39_polynomial_mmx(polys); + break; + case SSE: + set_viterbi39_polynomial_sse(polys); + break; + case SSE2: + set_viterbi39_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi39_polynomial_port(polys); + break; +#endif + } +} + + +/* Initialize Viterbi decoder for start of new frame */ +int 
init_viterbi39(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return init_viterbi39_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi39_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi39_mmx(p,starting_state); + case SSE: + return init_viterbi39_sse(p,starting_state); + case SSE2: + return init_viterbi39_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi39_port(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi39( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi39_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi39_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi39_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi39_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi39_sse2(p,data,nbits,endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi39_port(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi39_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi39_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi39_mmx(p); + break; + case SSE: + delete_viterbi39_sse(p); + break; + case SSE2: + delete_viterbi39_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi39_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi39_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi39_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi39_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi39_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi39_blk_sse2(p,syms,nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi39_blk_port(p,syms,nbits); +#endif + } +} diff --git a/libfec/viterbi39_av.c b/libfec/viterbi39_av.c new file mode 100644 index 0000000..2deed51 --- /dev/null +++ b/libfec/viterbi39_av.c @@ -0,0 +1,251 @@ +/* K=9 r=1/3 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t; +typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t; + +static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_av(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + for(i=0;i<32;i++) + vp->metrics1.v[i] = (vector unsigned short)(1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + 
vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_av(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder with room for len+8 per-bit decisions; returns NULL if allocation fails */ +void *create_viterbi39_av(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_av(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) return NULL; /* Do not dereference a failed allocation */ + if((vp->decisions = malloc(sizeof(decision_t)*(len+8))) == NULL){ free(vp); return NULL; } /* Release the state block so nothing leaks */ + init_viterbi39_av(vp,0); + return vp; +} + +/* Viterbi chainback */ +int chainback_viterbi39_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 
1 : 0; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_av(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + syms += 3; + + for(i=0;i<16;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * the metrics are in the range 0-765 + */ + m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v)); + m1 = vec_xor(Branchtab39[2].v[i],sym2v); + metric = vec_add(m0,m1); + m_metric = vec_sub((vector unsigned short)(765),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[16+i],metric); + m1 = vec_adds(vp->old_metrics->v[16+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<32;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825 + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-5000){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=31;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/libfec/viterbi39_mmx.c b/libfec/viterbi39_mmx.c new file mode 100644 index 0000000..875391a --- /dev/null +++ b/libfec/viterbi39_mmx.c @@ -0,0 +1,185 @@ +/* K=9 r=1/3 Viterbi decoder for x86 MMX + * Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[256]; __m64 v[32];} decision_t; +typedef union { unsigned short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* 
path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_mmx(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} +/* Fill the branch table from the three polynomials; a negative polynomial inverts that symbol, so parity is taken on abs(poly) */ +void set_viterbi39_polynomial_mmx(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_mmx(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA,V39POLYB,V39POLYC }; + set_viterbi39_polynomial_mmx(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_mmx(vp,0); + return vp; +} + + + +/* Viterbi chainback */ +int chainback_viterbi39_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_mmx(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + if(vp->new_metrics->s[0] < vp->old_metrics->s[0]) + path_metric += 65536; /* Hack: wraparound probably occured */ + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_port.c b/libfec/viterbi39_port.c new file mode 100644 index 0000000..5685c90 --- /dev/null +++ b/libfec/viterbi39_port.c @@ -0,0 +1,168 @@ +/* K=9 r=1/3 Viterbi decoder in portable C + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public 
License (LGPL) + */ +#include +#include +#include +#include "fec.h" + +typedef union { unsigned int w[256]; } metric_t; +typedef union { unsigned long w[8];} decision_t; + +static union { unsigned char c[128]; } Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_port(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.w[i] = 63; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */ + return 0; +} + +void set_viterbi39_polynomial_port(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].c[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255 : 0; + Branchtab39[1].c[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255 : 0; + Branchtab39[2].c[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 
255 : 0; + } + Init++; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_port(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = {V39POLYA,V39POLYB,V39POLYC}; + set_viterbi39_polynomial_port(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL) + return NULL; + + if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_port(vp,0); + + return vp; +} + + +/* Viterbi chainback */ +int chainback_viterbi39_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = vp->decisions; + /* Make room beyond the end of the encoder register so we can + * accumulate a full byte of decoded data + */ + endstate %= 256; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[(endstate)/32] >> (endstate%32)) & 1; + data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); + } + return 0; +} + + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_port(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned int metric,m0,m1,decision;\ + metric = (Branchtab39[0].c[i] ^ sym0) + (Branchtab39[1].c[i] ^ sym1) + \ + (Branchtab39[2].c[i] ^ sym2);\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+128] + (765 - metric);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i] = decision ? 
m1 : m0;\ + d->w[i/16] |= decision << ((2*i)&31);\ + m0 -= (metric+metric-765);\ + m1 += (metric+metric-765);\ + decision = (signed int)(m0-m1) > 0;\ + vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\ + d->w[i/16] |= decision << ((2*i+1)&31);\ +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + while(nbits--){ + void *tmp; + unsigned char sym0,sym1,sym2; + int i; + + for(i=0;i<8;i++) + d->w[i] = 0; + sym0 = *syms++; + sym1 = *syms++; + sym2 = *syms++; + + for(i=0;i<128;i++) + BFLY(i); + + d++; + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} diff --git a/libfec/viterbi39_sse.c b/libfec/viterbi39_sse.c new file mode 100644 index 0000000..c2f2865 --- /dev/null +++ b/libfec/viterbi39_sse.c @@ -0,0 +1,201 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE + * Copyright Aug 2006, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; +typedef union { signed short s[256]; __m64 v[64];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse(void *p,int 
starting_state){ + struct v39 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse(int len){ + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse(polys); + } + if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi39_sse(vp,0); + return vp; +} +/* Fill the branch table from the three polynomials; a negative polynomial inverts that symbol, so parity is taken on abs(poly) */ +void set_viterbi39_polynomial_sse(int polys[3]){ + int state; + + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d; + int path_metric; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric - SHRT_MIN; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d; + int path_metric = 0; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + syms += 3; + + for(i=0;i<32;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<64;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<64;i++) + vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp 
= d; + _mm_empty(); + return path_metric; +} diff --git a/libfec/viterbi39_sse2.c b/libfec/viterbi39_sse2.c new file mode 100644 index 0000000..f13794e --- /dev/null +++ b/libfec/viterbi39_sse2.c @@ -0,0 +1,200 @@ +/* K=9 r=1/3 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; +typedef union { signed short s[256]; __m128i v[32];} metric_t; + +static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v39 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi39_sse2(void *p,int starting_state){ + struct v39 *vp = p; + int i; + + for(i=0;i<256;i++) + vp->metrics1.s[i] = (SHRT_MIN+1000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi39_sse2(int len){ + void *p; + struct v39 *vp; + + if(!Init){ + int polys[3] = { V39POLYA, V39POLYB, V39POLYC }; + + set_viterbi39_polynomial_sse2(polys); + } + /* Ordinary malloc() only returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v39))) + return NULL; + + vp = (struct v39 *)p; + if((p = malloc((len+8)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; +
init_viterbi39_sse2(vp,0); + return vp; +} + +void set_viterbi39_polynomial_sse2(int polys[3]){ + int state; + + /* A negative polynomial selects an inverted encoder output; the tap mask is its magnitude */ + for(state=0;state < 128;state++){ + Branchtab39[0].s[state] = (polys[0] < 0) ^ parity((2*state) & abs(polys[0])) ? 255:0; + Branchtab39[1].s[state] = (polys[1] < 0) ^ parity((2*state) & abs(polys[1])) ? 255:0; + Branchtab39[2].s[state] = (polys[2] < 0) ^ parity((2*state) & abs(polys[2])) ? 255:0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi39_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 256; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 8; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* update_viterbi39_blk_sse2() stores sixteen 16-bit decision masks in s[]; + * index them as shorts so the lookup does not depend on sizeof(unsigned long) + */ + k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1; + endstate = (k << 7) | (endstate >> 1); + data[nbits>>3] = endstate; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi39_sse2(void *p){ + struct v39 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v39 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + + while(nbits--){ + __m128i sym0v,sym1v,sym2v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi16(syms[0]); + sym1v = _mm_set1_epi16(syms[1]); + sym2v = _mm_set1_epi16(syms[2]); + syms += 3; + + /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */ + for(i=0;i<16;i++){ + __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + *
Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. + * metric and m_metric (-metric) are in the range 0-765 + */ + m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v)); + metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0); + m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric); + m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric); + m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_epi16(m0,m1); + survivor1 = _mm_min_epi16(m2,m3); + decision0 = _mm_cmpeq_epi16(survivor0,m1); + decision1 = _mm_cmpeq_epi16(survivor1,m3); + + /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */ + d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1); + } + /* See if we need to renormalize */ + if(vp->new_metrics->s[0] >= SHRT_MAX-5000){ + int i,adjust; + __m128i adjustv; + union { __m128i v; signed short w[8]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<32;i++) + adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + path_metric += adjust; + adjustv = _mm_set1_epi16(adjust); + + /* 
We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for(i=0;i<32;i++) + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} + + diff --git a/libfec/viterbi615.c b/libfec/viterbi615.c new file mode 100644 index 0000000..ec2fb3c --- /dev/null +++ b/libfec/viterbi615.c @@ -0,0 +1,181 @@ +/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD + * Copyright Feb 2004, Phil Karn, KA9Q + */ +#include +#include +#include +#include "fec.h" + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615(int len){ + + find_cpu_mode(); + + switch(Cpu_mode){ + case PORT: + default: + return create_viterbi615_port(len); +#ifdef __VEC__ + case ALTIVEC: + return create_viterbi615_av(len); +#endif +#ifdef __i386__ + case MMX: + return create_viterbi615_mmx(len); + case SSE: + return create_viterbi615_sse(len); + case SSE2: + return create_viterbi615_sse2(len); +#endif +#ifdef __x86_64__ + case SSE2: + return create_viterbi615_port(len); +#endif + } +} + +void set_viterbi615_polynomial(int polys[6]){ + + switch(Cpu_mode){ + case PORT: + default: + set_viterbi615_polynomial_port(polys); + break; +#ifdef __VEC__ + case ALTIVEC: + set_viterbi615_polynomial_av(polys); + break; +#endif +#ifdef __i386__ + case MMX: + set_viterbi615_polynomial_mmx(polys); + break; + case SSE: + set_viterbi615_polynomial_sse(polys); + break; + case SSE2: + set_viterbi615_polynomial_sse2(polys); + break; +#endif +#ifdef __x86_64__ + case SSE2: + set_viterbi615_polynomial_port(polys); + break; +#endif + } +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615(void *p,int starting_state){ + switch(Cpu_mode){ + case PORT: + default: + return 
init_viterbi615_port(p,starting_state); +#ifdef __VEC__ + case ALTIVEC: + return init_viterbi615_av(p,starting_state); +#endif +#ifdef __i386__ + case MMX: + return init_viterbi615_mmx(p,starting_state); + case SSE: + return init_viterbi615_sse(p,starting_state); + case SSE2: + return init_viterbi615_sse2(p,starting_state); +#endif +#ifdef __x86_64__ + case SSE2: + return init_viterbi615_port(p,starting_state); +#endif + } +} + +/* Viterbi chainback */ +int chainback_viterbi615( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + + switch(Cpu_mode){ + case PORT: + default: + return chainback_viterbi615_port(p,data,nbits,endstate); +#ifdef __VEC__ + case ALTIVEC: + return chainback_viterbi615_av(p,data,nbits,endstate); +#endif +#ifdef __i386__ + case MMX: + return chainback_viterbi615_mmx(p,data,nbits,endstate); + case SSE: + return chainback_viterbi615_sse(p,data,nbits,endstate); + case SSE2: + return chainback_viterbi615_sse2(p,data,nbits,endstate); +#endif +#ifdef __x86_64__ + case SSE2: + return chainback_viterbi615_port(p,data,nbits,endstate); +#endif + } +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615(void *p){ + switch(Cpu_mode){ + case PORT: + default: + delete_viterbi615_port(p); + break; +#ifdef __VEC__ + case ALTIVEC: + delete_viterbi615_av(p); + break; +#endif +#ifdef __i386__ + case MMX: + delete_viterbi615_mmx(p); + break; + case SSE: + delete_viterbi615_sse(p); + break; + case SSE2: + delete_viterbi615_sse2(p); + break; +#endif +#ifdef __x86_64__ + case SSE2: + delete_viterbi615_port(p); + break; +#endif + } +} + +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! 
+ */ +int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){ + switch(Cpu_mode){ + case PORT: + default: + return update_viterbi615_blk_port(p,syms,nbits); +#ifdef __VEC__ + case ALTIVEC: + return update_viterbi615_blk_av(p,syms,nbits); +#endif +#ifdef __i386__ + case MMX: + return update_viterbi615_blk_mmx(p,syms,nbits); + case SSE: + return update_viterbi615_blk_sse(p,syms,nbits); + case SSE2: + return update_viterbi615_blk_sse2(p,syms,nbits); +#endif +#ifdef __x86_64__ + case SSE2: + return update_viterbi615_blk_port(p,syms,nbits); +#endif + } +} + diff --git a/libfec/viterbi615_av.c b/libfec/viterbi615_av.c new file mode 100644 index 0000000..4a6ce9c --- /dev/null +++ b/libfec/viterbi615_av.c @@ -0,0 +1,257 @@ +/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions + * 8-bit offset-binary soft decision samples + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t; +typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t; + +static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_av(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + + for(i=0;i<2048;i++) + vp->metrics1.v[i] = (vector unsigned short)(5000); + + vp->old_metrics = &vp->metrics1; + 
vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder + * Returns NULL if either the decoder state or the decision buffer cannot be allocated + */ +void *create_viterbi615_av(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_av(polys); + } + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc(sizeof(decision_t)*(len+14))) == NULL){ + free(vp); /* Don't leak the state block when the decision buffer fails */ + return NULL; + } + init_viterbi615_av(vp,0); + return vp; +} + +void set_viterbi615_polynomial_av(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + + +/* Viterbi chainback */ +int chainback_viterbi615_av( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + int path_metric; + + endstate %= 16384; + + path_metric = vp->old_metrics->s[endstate]; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ?
1 : 0; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return path_metric; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_av(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + int path_metric = 0; + vector unsigned char decisions = (vector unsigned char)(0); + + while(nbits--){ + vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + vector unsigned char s; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms)); + + symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s); /* Unsigned byte->word unpack */ + sym0v = vec_splat(symv,0); + sym1v = vec_splat(symv,1); + sym2v = vec_splat(symv,2); + sym3v = vec_splat(symv,3); + sym4v = vec_splat(symv,4); + sym5v = vec_splat(symv,5); + syms += 6; + + for(i=0;i<1024;i++){ + vector bool short decision0,decision1; + vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v)); + m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v)); + m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v)); + metric = vec_add(m0,m1); + metric = vec_add(metric,m2); + m_metric = vec_sub((vector unsigned short)(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = vec_adds(vp->old_metrics->v[i],metric); + m3 = vec_adds(vp->old_metrics->v[1024+i],metric); + m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric); + m2 = vec_adds(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + decision0 = vec_cmpgt(m0,m1); + decision1 = vec_cmpgt(m2,m3); + survivor0 = vec_min(m0,m1); + survivor1 = vec_min(m2,m3); + + /* Store decisions and survivors. + * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in + * a funny interleaved fashion that we undo in the chainback function. + */ + decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */ + + /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting + * 0xff is equivalent to adding 1, which sets the lsb. 
+ */ + decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1))); + + vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1); + + if((i % 8) == 7){ + /* We've accumulated a total of 128 decisions, stash and start again */ + d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */ + } + } +#if 0 + /* Experimentally determine metric spread + * The results are fixed for a given code and input symbol size + */ + { + int i; + vector unsigned short min_metric; + vector unsigned short max_metric; + union { vector unsigned short v; unsigned short s[8];} t; + int minimum,maximum; + static int max_spread = 0; + + min_metric = max_metric = vp->new_metrics->v[0]; + for(i=1;i<2048;i++){ + min_metric = vec_min(min_metric,vp->new_metrics->v[i]); + max_metric = vec_max(max_metric,vp->new_metrics->v[i]); + } + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4)); + min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2)); + max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2)); + + t.v = min_metric; + minimum = t.s[0]; + t.v = max_metric; + maximum = t.s[0]; + if(maximum-minimum > max_spread){ + max_spread = maximum-minimum; + printf("metric spread = %d\n",max_spread); + } + } +#endif + + /* Renormalize if necessary. This deserves some explanation. + + * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750. + * So by looking at one arbitrary metric we can tell if any of them have possibly saturated. + * However, this is very conservative. 
Large spreads occur only at very high Eb/No, where + * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor. + + * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric + * by not normalizing when we should are extremely low. So either way, the risk to performance is small. + + * All this is borne out by experiment. + */ + if(vp->new_metrics->s[0] >= USHRT_MAX-12750){ + vector unsigned short scale; + union { vector unsigned short v; unsigned short s[8];} t; + + /* Find smallest metric and splat */ + scale = vp->new_metrics->v[0]; + for(i=1;i<2048;i++) + scale = vec_min(scale,vp->new_metrics->v[i]); + + scale = vec_min(scale,vec_sld(scale,scale,8)); + scale = vec_min(scale,vec_sld(scale,scale,4)); + scale = vec_min(scale,vec_sld(scale,scale,2)); + + /* Subtract it from all metrics + * Work backwards to try to improve the cache hit ratio, assuming LRU + */ + for(i=2047;i>=0;i--) + vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale); + t.v = scale; + path_metric += t.s[0]; + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return path_metric; +} diff --git a/libfec/viterbi615_mmx.c b/libfec/viterbi615_mmx.c new file mode 100644 index 0000000..89a56f7 --- /dev/null +++ b/libfec/viterbi615_mmx.c @@ -0,0 +1,183 @@ +/* K=15 r=1/6 Viterbi decoder for x86 MMX + * Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t; +typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { +
metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_mmx(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = 5000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_mmx(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_mmx(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_mmx(vp,0); + return vp; +} + +void set_viterbi615_polynomial_mmx(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_mmx( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = d[nbits].c[endstate] & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_mmx(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + + d = (decision_t *)vp->dp; + + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_add_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select + * There's no packed min instruction in MMX, so we use modulo arithmetic + * to form the decisions and then do the select the hard way + */ + decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64()); + decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64()); + survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0)); + survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2)); + + /* Merge decisions and store as bytes */ + d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/libfec/viterbi615_port.c b/libfec/viterbi615_port.c new file mode 100644 index 0000000..89bdd80 --- /dev/null +++ b/libfec/viterbi615_port.c @@ -0,0 +1,156 @@ +/* K=15 r=1/6 Viterbi decoder in portable C + * Copyright Mar 2004, Phil Karn, 
KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { unsigned long w[16384]; } metric_t; + +static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16))); +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + decision_t *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + decision_t *decisions; /* Beginning of decisions for block */ +}; + +/* Create a new instance of a Viterbi decoder + * Returns NULL if either the decoder state or the decision buffer cannot be allocated + */ +void *create_viterbi615_port(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_port(polys); + } + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL) + return NULL; + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + /* Use the portable initializer directly: vp has this file's struct layout, + * so it must not be routed through the Cpu_mode dispatcher init_viterbi615() + */ + init_viterbi615_port(vp,0); + return vp; +} + +void set_viterbi615_polynomial_port(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].w[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ?
255 : 0; + } + Init++; +} + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_port(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.w[i] = 1000; + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->w[starting_state & 16383] = 0; /* Bias known start state */ + return 0; +} + +/* Viterbi chainback */ +int chainback_viterbi615_port( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_port(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + +/* C-language butterfly */ +#define BFLY(i) {\ +unsigned long metric,m0,m1,m2,m3,decision0,decision1;\ + metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\ + +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\ + +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\ + m0 = vp->old_metrics->w[i] + metric;\ + m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\ + m2 = vp->old_metrics->w[i] + (1530-metric);\ + m3 = vp->old_metrics->w[i+8192] + metric;\ + decision0 = (signed long)(m0-m1) >= 0;\ + decision1 = (signed long)(m2-m3) >= 0;\ + vp->new_metrics->w[2*i] = decision0 ? 
m1 : m0;\ + vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\ + d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\ +} +/* Update decoder with a block of demodulated symbols + * Note that nbits is the number of decoded data bits, not the number + * of symbols! + */ + +int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + void *tmp; + decision_t *d; + int i; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + memset(d,0,sizeof(decision_t)); + for(i=0;i<8192;i++) + BFLY(i); + + syms += 6; + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} + diff --git a/libfec/viterbi615_sse.c b/libfec/viterbi615_sse.c new file mode 100644 index 0000000..de0f8af --- /dev/null +++ b/libfec/viterbi615_sse.c @@ -0,0 +1,201 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t; +typedef union { signed short s[16384]; __m64 v[4096];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = (SHRT_MIN+5000); + 
+ vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse(int len){ + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_sse(polys); + } + + if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL){ + return NULL; + } + if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + init_viterbi615_sse(vp,0); + return vp; +} + +void set_viterbi615_polynomial_sse(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->decisions; + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. 
+ * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + /* k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1;*/ + k = (d[nbits].c[endstate/8] >> (endstate%8)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d; + + if(p == NULL) + return -1; + d = (decision_t *)vp->dp; + while(nbits--){ + __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_pi16(syms[0]); + sym1v = _mm_set1_pi16(syms[1]); + sym2v = _mm_set1_pi16(syms[2]); + sym3v = _mm_set1_pi16(syms[3]); + sym4v = _mm_set1_pi16(syms[4]); + sym5v = _mm_set1_pi16(syms[5]); + syms += 6; + + for(i=0;i<2048;i++){ + __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1; + + /* Form branch metrics + * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255, + * the XOR operations constitute conditional negation. 
+ * metric and m_metric (-metric) are in the range 0-1530 + */ + m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v)); + m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v)); + m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v)); + metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2)); + m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric); + + /* Add branch metrics to path metrics */ + m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric); + m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric); + m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric); + m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric); + + /* Compare and select */ + survivor0 = _mm_min_pi16(m0,m1); + survivor1 = _mm_min_pi16(m2,m3); + decision0 = _mm_cmpeq_pi16(survivor0,m1); + decision1 = _mm_cmpeq_pi16(survivor1,m3); + + /* Pack decisions into 8 bits and store */ + d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()))); + + /* Store surviving metrics */ + vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1); + vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1); + } + /* See if we need to renormalize + * Max metric spread for this code with 0-255 branch metrics is 12750 + */ + if(vp->new_metrics->s[0] >= SHRT_MAX-12750){ + int i,adjust; + __m64 adjustv; + union { __m64 v; signed short w[4]; } t; + + /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<4096;i++) + adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32)); + adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_pi16(adjust); + + for(i=0;i<4096;i++) + vp->new_metrics->v[i] = 
_mm_sub_pi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + _mm_empty(); + return 0; +} diff --git a/libfec/viterbi615_sse2.c b/libfec/viterbi615_sse2.c new file mode 100644 index 0000000..7f711e5 --- /dev/null +++ b/libfec/viterbi615_sse2.c @@ -0,0 +1,204 @@ +/* K=15 r=1/6 Viterbi decoder for x86 SSE2 + * Copyright Mar 2004, Phil Karn, KA9Q + * May be used under the terms of the GNU Lesser General Public License (LGPL) + */ +#include +#include +#include +#include +#include +#include "fec.h" + +typedef union { unsigned long w[512]; unsigned short s[1024];} decision_t; +typedef union { signed short s[16384]; __m128i v[2048];} metric_t; + +static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6]; +static int Init = 0; + +/* State info for instance of Viterbi decoder */ +struct v615 { + metric_t metrics1; /* path metric buffer 1 */ + metric_t metrics2; /* path metric buffer 2 */ + void *dp; /* Pointer to current decision */ + metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ + void *decisions; /* Beginning of decisions for block */ +}; + +/* Initialize Viterbi decoder for start of new frame */ +int init_viterbi615_sse2(void *p,int starting_state){ + struct v615 *vp = p; + int i; + + if(p == NULL) + return -1; + for(i=0;i<16384;i++) + vp->metrics1.s[i] = (SHRT_MIN+5000); + + vp->old_metrics = &vp->metrics1; + vp->new_metrics = &vp->metrics2; + vp->dp = vp->decisions; + vp->old_metrics->s[starting_state & 16383] = SHRT_MIN; /* Bias known start state */ + return 0; +} + +/* Create a new instance of a Viterbi decoder */ +void *create_viterbi615_sse2(int len){ + void *p; + struct v615 *vp; + + if(!Init){ + int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF }; + set_viterbi615_polynomial_sse2(polys); + } + + /* Ordinary malloc() only 
returns 8-byte alignment, we need 16 */ + if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v615))) + return NULL; + + vp = (struct v615 *)p; + if((p = malloc((len+14)*sizeof(decision_t))) == NULL){ + free(vp); + return NULL; + } + vp->decisions = (decision_t *)p; + init_viterbi615_sse2(vp,0); + return vp; +} + +void set_viterbi615_polynomial_sse2(int polys[6]){ + int state; + int i; + + for(state=0;state < 8192;state++){ + for(i=0;i<6;i++) + Branchtab615[i].s[state] = (polys[i] < 0) ^ parity((2*state) & abs(polys[i])) ? 255 : 0; + } + Init++; +} + +/* Viterbi chainback */ +int chainback_viterbi615_sse2( + void *p, + unsigned char *data, /* Decoded output data */ + unsigned int nbits, /* Number of data bits */ + unsigned int endstate){ /* Terminal encoder state */ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->decisions; + + endstate %= 16384; + + /* The store into data[] only needs to be done every 8 bits. + * But this avoids a conditional branch, and the writes will + * combine in the cache anyway + */ + d += 14; /* Look past tail */ + while(nbits-- != 0){ + int k; + + k = (d[nbits].w[endstate/32] >> (endstate%32)) & 1; + endstate = (k << 13) | (endstate >> 1); + data[nbits>>3] = endstate >> 6; + } + return 0; +} + +/* Delete instance of a Viterbi decoder */ +void delete_viterbi615_sse2(void *p){ + struct v615 *vp = p; + + if(vp != NULL){ + free(vp->decisions); + free(vp); + } +} + + +int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){ + struct v615 *vp = p; + decision_t *d = (decision_t *)vp->dp; + + while(nbits--){ + __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v; + void *tmp; + int i; + + /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */ + sym0v = _mm_set1_epi16(syms[0]); + sym1v = _mm_set1_epi16(syms[1]); + sym2v = _mm_set1_epi16(syms[2]); + sym3v = _mm_set1_epi16(syms[3]); + sym4v = _mm_set1_epi16(syms[4]); + sym5v = _mm_set1_epi16(syms[5]); + syms += 6; + + /* SSE2 doesn't support saturated adds on 
unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits.
+       * Even bits of d->s[i] come from decision0 and odd bits from decision1, the same
+       * interleaving used for the survivors stored below, so bit b of d->s[i] is the
+       * decision for entry 16*i+b of new_metrics->s[].
+       */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize.
+     * The threshold leaves 12750 of headroom below SHRT_MAX, the same margin the
+     * MMX/SSE build of this decoder uses for symbols in the range 0-255.
+     * NOTE(review): presumably 12750 is the maximum metric spread for this code - confirm.
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+
+      /* Find smallest metric and set
adjustv to bring it down to SHRT_MIN */ + adjustv = vp->new_metrics->v[0]; + for(i=1;i<2048;i++) + adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]); + + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4)); + adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2)); + t.v = adjustv; + adjust = t.w[0] - SHRT_MIN; + adjustv = _mm_set1_epi16(adjust); + + /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX + * This is okay since it can't overflow anyway + */ + for(i=0;i<2048;i++) + vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv); + } + d++; + /* Swap pointers to old and new metrics */ + tmp = vp->old_metrics; + vp->old_metrics = vp->new_metrics; + vp->new_metrics = tmp; + } + vp->dp = d; + return 0; +} + + diff --git a/libfec/vtest27.c b/libfec/vtest27.c new file mode 100644 index 0000000..7256483 --- /dev/null +++ b/libfec/vtest27.c @@ -0,0 +1,184 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./2.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long int tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*2*(MAXBYTES+6)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi27(framebits)) == NULL){ + printf("create_viterbi27 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi27(vp,0); + + /* Decode block */ + update_viterbi27_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi27(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} diff --git a/libfec/vtest29.c b/libfec/vtest29.c new file mode 100644 index 0000000..8471b54 --- /dev/null +++ b/libfec/vtest29.c @@ -0,0 +1,185 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./2.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*2*(MAXBYTES+8)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi29(framebits)) == NULL){ + printf("create_viterbi29 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi29(vp,0); + + /* Decode block */ + update_viterbi29_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi29(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} + + diff --git a/libfec/vtest39.c b/libfec/vtest39.c new file mode 100644 index 0000000..76723b2 --- /dev/null +++ b/libfec/vtest39.c @@ -0,0 +1,186 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./3.) 
+#define MAXBYTES 10000 + +double Gain = 32.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10000,errcnt,framebits=2048; + long long tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*3*(MAXBYTES+8)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi39(framebits)) == NULL){ + printf("create_viterbi39 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n", + tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials), + badframes,tr+1,(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi39(vp,0); + + /* Decode block */ + update_viterbi39_blk(vp,symbols,framebits); + + /* Do Viterbi chainback */ + chainback_viterbi39(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +} + + diff --git a/libfec/vtest615.c b/libfec/vtest615.c new file mode 100644 index 0000000..4bd8c4f --- /dev/null +++ b/libfec/vtest615.c @@ -0,0 +1,191 @@ +/* Test viterbi decoder speeds */ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_GETOPT_H +#include +#endif +#include "fec.h" + +#if HAVE_GETOPT_LONG +struct option Options[] = { + {"frame-length",1,NULL,'l'}, + {"frame-count",1,NULL,'n'}, + {"ebn0",1,NULL,'e'}, + {"gain",1,NULL,'g'}, + {"verbose",0,NULL,'v'}, + {"force-altivec",0,NULL,'a'}, + {"force-port",0,NULL,'p'}, + {"force-mmx",0,NULL,'m'}, + {"force-sse",0,NULL,'s'}, + {"force-sse2",0,NULL,'t'}, + {NULL}, +}; +#endif + +#define RATE (1./6.) 
+#define MAXBYTES 10000 +#define OFFSET (127.5) +#define CLIP 255 + +double Gain = 24.0; +int Verbose = 0; + +int main(int argc,char *argv[]){ + int i,d,tr; + int sr=0,trials = 10,errcnt,framebits=2048; + int tot_errs=0; + unsigned char bits[MAXBYTES]; + unsigned char data[MAXBYTES]; + unsigned char xordata[MAXBYTES]; + unsigned char symbols[8*6*(MAXBYTES+14)]; + void *vp; + extern char *optarg; + struct rusage start,finish; + double extime; + double gain,esn0,ebn0; + time_t t; + int badframes=0; + + time(&t); + srandom(t); + ebn0 = -100; +#if HAVE_GETOPT_LONG + while((d = getopt_long(argc,argv,"l:n:te:g:vapmst",Options,NULL)) != EOF){ +#else + while((d = getopt(argc,argv,"l:n:te:g:vapmst")) != EOF){ +#endif + switch(d){ + case 'a': + Cpu_mode = ALTIVEC; + break; + case 'p': + Cpu_mode = PORT; + break; + case 'm': + Cpu_mode = MMX; + break; + case 's': + Cpu_mode = SSE; + break; + case 't': + Cpu_mode = SSE2; + break; + case 'l': + framebits = atoi(optarg); + break; + case 'n': + trials = atoi(optarg); + break; + case 'e': + ebn0 = atof(optarg); + break; + case 'g': + Gain = atof(optarg); + break; + case 'v': + Verbose++; + break; + } + } + if(framebits > 8*MAXBYTES){ + fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8); + framebits = MAXBYTES*8; + } + if((vp = create_viterbi615(framebits)) == NULL){ + printf("create_viterbi615 failed\n"); + exit(1); + } + if(ebn0 != -100){ + esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */ + /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing + * only half the noise power, and the sqrt() converts power to + * voltage. 
+ */ + gain = 1./sqrt(0.5/pow(10.,esn0/10.)); + + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + + for(tr=0;tr 1 && errcnt != 0){ + printf("frame %d, %d errors: ",tr,errcnt); + for(i=0;i 1) + printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain); + else if(Verbose == 0) + printf("BER %d/%d (%.3g) FER %d/%d (%.3g)\n", + tot_errs,framebits*(tr+1),tot_errs/((double)framebits*(tr+1)), + badframes,(tr+1),(double)badframes/(tr+1)); + else + printf("\n"); + } else { + /* Do time trials */ + memset(symbols,127,sizeof(symbols)); + printf("Starting time trials\n"); + getrusage(RUSAGE_SELF,&start); + for(tr=0;tr < trials;tr++){ + /* Initialize Viterbi decoder */ + init_viterbi615(vp,0); + + /* Decode block */ + update_viterbi615_blk(vp,symbols,framebits+14); + + /* Do Viterbi chainback */ + chainback_viterbi615(vp,data,framebits,0); + } + getrusage(RUSAGE_SELF,&finish); + extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec); + printf("Execution time for %d %d-bit frames: %.2f sec\n",trials, + framebits,extime); + printf("decoder speed: %g bits/s\n",trials*framebits/extime); + } + exit(0); +}