diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4be403f..34ce3db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,7 +131,6 @@ find_package(Volk REQUIRED)
 find_package(OggVorbis REQUIRED)
 find_package(PNG REQUIRED)
 find_package(png++ REQUIRED)
-find_package(Fec REQUIRED)
 
 ########################################################################
 # Include or not into the module blocks for debugging
@@ -150,6 +149,45 @@ if(${INCLUDE_DEBUG_BLOCKS})
     endif()
 endif()
 
+########################################################################
+# Use the system libfec when find_package(Fec) locates it; otherwise
+# build the bundled copy in libfec/ and install it with this project.
+########################################################################
+find_package(Fec)
+if(NOT FEC_FOUND)
+    message(WARNING "libfec is not installed. The internal libfec will be automatically build and install.")
+    include(ExternalProject)
+    ExternalProject_Add(FEC_EXTERNAL
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libfec
+        BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libfec
+        CMAKE_ARGS "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
+                   "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+                   "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+        INSTALL_COMMAND ""
+    )
+    # INSTALL_COMMAND is empty, so the built library stays in the external build tree.
+    ExternalProject_Get_Property(FEC_EXTERNAL binary_dir)
+    add_library(fec SHARED IMPORTED)
+    # Only binary_dir is queried above; install_dir was never set here and
+    # expanded to nothing, which made IMPORTED_LOCATION resolve to /libfec.so.
+    set_property(TARGET fec PROPERTY IMPORTED_LOCATION "${binary_dir}/libfec.so")
+    add_dependencies(fec FEC_EXTERNAL)
+    set(FEC_LIBRARIES "${binary_dir}/libfec.so")
+    set(FEC_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libfec")
+    
+    # Install the header and the library in the standard places
+    install(FILES
+        "${FEC_INCLUDE_DIRS}/fec.h"
+        DESTINATION "include"
+    )
+    install(FILES
+        ${FEC_LIBRARIES}
+        DESTINATION lib${LIB_SUFFIX}
+    )
+else()
+    add_library(fec INTERFACE)  # empty placeholder so add_dependencies(... fec) still has a target
+endif()
+
 # Search for GNU Radio and its components and versions. Add any
 # components required to the list of GR_REQUIRED_COMPONENTS (in all
 # caps such as FILTER or FFT) and change the version to the minimum
diff --git a/apps/flowgraphs/debug_afsk_transceiver_osmocom.py b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py
new file mode 100755
index 0000000..a9e0892
--- /dev/null
+++ b/apps/flowgraphs/debug_afsk_transceiver_osmocom.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+##################################################
+# GNU Radio Python Flow Graph
+# Title: Debug Afsk Transceiver Osmocom
+# Generated: Mon Jun 13 20:30:12 2016
+##################################################
+
+if __name__ == '__main__':  # script entry only: set up Xlib threading before the Qt imports below
+    import ctypes
+    import sys
+    if sys.platform.startswith('linux'):
+        try:
+            x11 = ctypes.cdll.LoadLibrary('libX11.so')
+            x11.XInitThreads()
+        except Exception:  # still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
+            print "Warning: failed to XInitThreads()"
+
+from PyQt4 import Qt
+from gnuradio import analog
+from gnuradio import audio
+from gnuradio import blocks
+from gnuradio import eng_notation
+from gnuradio import filter
+from gnuradio import gr
+from gnuradio import qtgui
+from gnuradio.eng_option import eng_option
+from gnuradio.filter import firdes
+from gnuradio.qtgui import Range, RangeWidget
+from optparse import OptionParser
+import math
+import numpy
+import satnogs
+import sip
+import sys
+
+
+class debug_afsk_transceiver_osmocom(gr.top_block, Qt.QWidget):
+    """Qt top block: UPSAT FSK frames -> Gaussian-filtered frequency modulator -> resampler -> quadrature demodulator -> audio sink."""
+    def __init__(self):
+        gr.top_block.__init__(self, "Debug Afsk Transceiver Osmocom")
+        Qt.QWidget.__init__(self)
+        self.setWindowTitle("Debug Afsk Transceiver Osmocom")
+        try:
+            self.setWindowIcon(Qt.QIcon.fromTheme('gnuradio-grc'))
+        except:
+            pass
+        self.top_scroll_layout = Qt.QVBoxLayout()
+        self.setLayout(self.top_scroll_layout)
+        self.top_scroll = Qt.QScrollArea()
+        self.top_scroll.setFrameStyle(Qt.QFrame.NoFrame)
+        self.top_scroll_layout.addWidget(self.top_scroll)
+        self.top_scroll.setWidgetResizable(True)
+        self.top_widget = Qt.QWidget()
+        self.top_scroll.setWidget(self.top_widget)
+        self.top_layout = Qt.QVBoxLayout(self.top_widget)
+        self.top_grid_layout = Qt.QGridLayout()
+        self.top_layout.addLayout(self.top_grid_layout)
+        # Restore the window geometry that closeEvent() stores under the same QSettings key.
+        self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom")
+        self.restoreGeometry(self.settings.value("geometry").toByteArray())
+
+        ##################################################
+        # Variables
+        ##################################################
+        self.samples_per_symbol_tx = samples_per_symbol_tx = 4
+        self.sq_wave = sq_wave = (1.0, ) * samples_per_symbol_tx
+        self.gaussian_taps = gaussian_taps = filter.firdes.gaussian(1.0, samples_per_symbol_tx, 1.0, 4*samples_per_symbol_tx)
+        self.deviation = deviation = 800
+        self.baud_rate = baud_rate = 1200
+        self.tx_frequency = tx_frequency = 145.835e6
+        self.samp_rate_tx = samp_rate_tx = 48e3
+        self.modulation_index = modulation_index = deviation / (baud_rate / 2.0)
+        self.interp_taps = interp_taps = numpy.convolve(numpy.array(gaussian_taps), numpy.array(sq_wave))
+        self.atten = atten = 0.1
+
+        ##################################################
+        # Blocks
+        ##################################################
+        self._atten_range = Range(0, 0.9, 0.01, 0.1, 200)
+        self._atten_win = RangeWidget(self._atten_range, self.set_atten, "Attenuation", "counter_slider", float)
+        self.top_layout.addWidget(self._atten_win)
+        self.satnogs_upsat_fsk_frame_encoder_0 = satnogs.upsat_fsk_frame_encoder([0x33]*8, [0x7A, 0x0E], False, False, False, True, True, "ABCD", 0, "UPSAT", 0, 1024)
+        self.satnogs_udp_msg_source_0 = satnogs.udp_msg_source("127.0.0.1", 16886, 1500)
+        self.satnogs_debug_msg_source_0 = satnogs.debug_msg_source("HELLO"*4, 1, True)
+        self.rational_resampler_xxx_0 = filter.rational_resampler_ccc(
+                interpolation=10,
+                decimation=1,
+                taps=None,
+                fractional_bw=None,
+        )
+        self.qtgui_time_sink_x_0_0_0 = qtgui.time_sink_c(
+        	1024, #size
+        	samp_rate_tx, #samp_rate
+        	"", #name
+        	1 #number of inputs
+        )
+        self.qtgui_time_sink_x_0_0_0.set_update_time(0.10)
+        self.qtgui_time_sink_x_0_0_0.set_y_axis(-1, 1)
+        
+        self.qtgui_time_sink_x_0_0_0.set_y_label("Amplitude", "")
+        
+        self.qtgui_time_sink_x_0_0_0.enable_tags(-1, True)
+        self.qtgui_time_sink_x_0_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "")
+        self.qtgui_time_sink_x_0_0_0.enable_autoscale(False)
+        self.qtgui_time_sink_x_0_0_0.enable_grid(False)
+        self.qtgui_time_sink_x_0_0_0.enable_control_panel(True)
+        # Constant condition emitted by the generator: the legend is never disabled here.
+        if not True:
+          self.qtgui_time_sink_x_0_0_0.disable_legend()
+        
+        labels = ["", "", "", "", "",
+                  "", "", "", "", ""]
+        widths = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        colors = ["blue", "red", "green", "black", "cyan",
+                  "magenta", "yellow", "dark red", "dark green", "blue"]
+        styles = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        markers = [2, -1, -1, -1, -1,
+                   -1, -1, -1, -1, -1]
+        alphas = [1.0, 1.0, 1.0, 1.0, 1.0,
+                  1.0, 1.0, 1.0, 1.0, 1.0]
+        # The single complex input is labelled as two lines (Re and Im), hence 2*1 iterations.
+        for i in xrange(2*1):
+            if len(labels[i]) == 0:
+                if(i % 2 == 0):
+                    self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Re{{Data {0}}}".format(i/2))
+                else:
+                    self.qtgui_time_sink_x_0_0_0.set_line_label(i, "Im{{Data {0}}}".format(i/2))
+            else:
+                self.qtgui_time_sink_x_0_0_0.set_line_label(i, labels[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_width(i, widths[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_color(i, colors[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_style(i, styles[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_marker(i, markers[i])
+            self.qtgui_time_sink_x_0_0_0.set_line_alpha(i, alphas[i])
+        
+        self._qtgui_time_sink_x_0_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0_0.pyqwidget(), Qt.QWidget)
+        self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_0_win)
+        self.qtgui_time_sink_x_0_0 = qtgui.time_sink_f(
+        	1024, #size
+        	samp_rate_tx, #samp_rate
+        	"", #name
+        	1 #number of inputs
+        )
+        self.qtgui_time_sink_x_0_0.set_update_time(0.10)
+        self.qtgui_time_sink_x_0_0.set_y_axis(-1, 1)
+        
+        self.qtgui_time_sink_x_0_0.set_y_label("Amplitude", "")
+        
+        self.qtgui_time_sink_x_0_0.enable_tags(-1, True)
+        self.qtgui_time_sink_x_0_0.set_trigger_mode(qtgui.TRIG_MODE_FREE, qtgui.TRIG_SLOPE_POS, 0.0, 0, 0, "")
+        self.qtgui_time_sink_x_0_0.enable_autoscale(False)
+        self.qtgui_time_sink_x_0_0.enable_grid(False)
+        self.qtgui_time_sink_x_0_0.enable_control_panel(True)
+        # Same constant condition as above: the legend of this sink is never disabled either.
+        if not True:
+          self.qtgui_time_sink_x_0_0.disable_legend()
+        
+        labels = ["", "", "", "", "",
+                  "", "", "", "", ""]
+        widths = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        colors = ["blue", "red", "green", "black", "cyan",
+                  "magenta", "yellow", "dark red", "dark green", "blue"]
+        styles = [1, 1, 1, 1, 1,
+                  1, 1, 1, 1, 1]
+        markers = [2, -1, -1, -1, -1,
+                   -1, -1, -1, -1, -1]
+        alphas = [1.0, 1.0, 1.0, 1.0, 1.0,
+                  1.0, 1.0, 1.0, 1.0, 1.0]
+        
+        for i in xrange(1):
+            if len(labels[i]) == 0:
+                self.qtgui_time_sink_x_0_0.set_line_label(i, "Data {0}".format(i))
+            else:
+                self.qtgui_time_sink_x_0_0.set_line_label(i, labels[i])
+            self.qtgui_time_sink_x_0_0.set_line_width(i, widths[i])
+            self.qtgui_time_sink_x_0_0.set_line_color(i, colors[i])
+            self.qtgui_time_sink_x_0_0.set_line_style(i, styles[i])
+            self.qtgui_time_sink_x_0_0.set_line_marker(i, markers[i])
+            self.qtgui_time_sink_x_0_0.set_line_alpha(i, alphas[i])
+        
+        self._qtgui_time_sink_x_0_0_win = sip.wrapinstance(self.qtgui_time_sink_x_0_0.pyqwidget(), Qt.QWidget)
+        self.top_layout.addWidget(self._qtgui_time_sink_x_0_0_win)
+        self.interp_fir_filter_xxx_0 = filter.interp_fir_filter_fff(samples_per_symbol_tx, (interp_taps))
+        self.interp_fir_filter_xxx_0.declare_sample_delay(0)
+        self.blocks_vco_f_0 = blocks.vco_f(48e3, -48e3, 1.0)
+        self.blocks_multiply_const_vxx_0 = blocks.multiply_const_vff((atten, ))
+        self.audio_sink_0_0 = audio.sink(48000, "", True)
+        self.analog_quadrature_demod_cf_0 = analog.quadrature_demod_cf(48e3/(2*math.pi*deviation/8.0))
+        self.analog_frequency_modulator_fc_0 = analog.frequency_modulator_fc((math.pi*modulation_index) / samples_per_symbol_tx)
+
+        ##################################################
+        # Connections
+        ##################################################
+        self.msg_connect((self.satnogs_debug_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu'))    
+        self.msg_connect((self.satnogs_udp_msg_source_0, 'msg'), (self.satnogs_upsat_fsk_frame_encoder_0, 'pdu'))    
+        self.connect((self.analog_frequency_modulator_fc_0, 0), (self.qtgui_time_sink_x_0_0_0, 0))    
+        self.connect((self.analog_frequency_modulator_fc_0, 0), (self.rational_resampler_xxx_0, 0))    
+        self.connect((self.analog_quadrature_demod_cf_0, 0), (self.blocks_multiply_const_vxx_0, 0))    
+        self.connect((self.blocks_multiply_const_vxx_0, 0), (self.audio_sink_0_0, 0))    
+        self.connect((self.blocks_vco_f_0, 0), (self.qtgui_time_sink_x_0_0, 0))    
+        self.connect((self.interp_fir_filter_xxx_0, 0), (self.analog_frequency_modulator_fc_0, 0))    
+        self.connect((self.interp_fir_filter_xxx_0, 0), (self.blocks_vco_f_0, 0))    
+        self.connect((self.rational_resampler_xxx_0, 0), (self.analog_quadrature_demod_cf_0, 0))    
+        self.connect((self.satnogs_upsat_fsk_frame_encoder_0, 0), (self.interp_fir_filter_xxx_0, 0))    
+
+    def closeEvent(self, event):
+        self.settings = Qt.QSettings("GNU Radio", "debug_afsk_transceiver_osmocom")
+        self.settings.setValue("geometry", self.saveGeometry())
+        event.accept()
+
+    # Variable accessors: each setter stores the value, then updates the variables/blocks derived from it.
+    def get_samples_per_symbol_tx(self):
+        return self.samples_per_symbol_tx
+
+    def set_samples_per_symbol_tx(self, samples_per_symbol_tx):
+        self.samples_per_symbol_tx = samples_per_symbol_tx
+        self.set_gaussian_taps(filter.firdes.gaussian(1.0, self.samples_per_symbol_tx, 1.0, 4*self.samples_per_symbol_tx))
+        self.set_sq_wave((1.0, ) * self.samples_per_symbol_tx)
+        self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx)
+
+    def get_sq_wave(self):
+        return self.sq_wave
+
+    def set_sq_wave(self, sq_wave):
+        self.sq_wave = sq_wave
+        self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), numpy.array(self.sq_wave)))
+
+    def get_gaussian_taps(self):
+        return self.gaussian_taps
+
+    def set_gaussian_taps(self, gaussian_taps):
+        self.gaussian_taps = gaussian_taps
+        self.set_interp_taps(numpy.convolve(numpy.array(self.gaussian_taps), numpy.array(self.sq_wave)))
+
+    def get_deviation(self):
+        return self.deviation
+
+    def set_deviation(self, deviation):
+        self.deviation = deviation
+        self.set_modulation_index(self.deviation / (self.baud_rate / 2.0))
+        self.analog_quadrature_demod_cf_0.set_gain(48e3/(2*math.pi*self.deviation/8.0))
+
+    def get_baud_rate(self):
+        return self.baud_rate
+
+    def set_baud_rate(self, baud_rate):
+        self.baud_rate = baud_rate
+        self.set_modulation_index(self.deviation / (self.baud_rate / 2.0))
+
+    def get_tx_frequency(self):
+        return self.tx_frequency
+
+    def set_tx_frequency(self, tx_frequency):
+        self.tx_frequency = tx_frequency  # stored only; no block in this flowgraph reads it
+
+    def get_samp_rate_tx(self):
+        return self.samp_rate_tx
+
+    def set_samp_rate_tx(self, samp_rate_tx):
+        self.samp_rate_tx = samp_rate_tx
+        self.qtgui_time_sink_x_0_0_0.set_samp_rate(self.samp_rate_tx)
+        self.qtgui_time_sink_x_0_0.set_samp_rate(self.samp_rate_tx)
+
+    def get_modulation_index(self):
+        return self.modulation_index
+
+    def set_modulation_index(self, modulation_index):
+        self.modulation_index = modulation_index
+        self.analog_frequency_modulator_fc_0.set_sensitivity((math.pi*self.modulation_index) / self.samples_per_symbol_tx)
+
+    def get_interp_taps(self):
+        return self.interp_taps
+
+    def set_interp_taps(self, interp_taps):
+        self.interp_taps = interp_taps
+        self.interp_fir_filter_xxx_0.set_taps((self.interp_taps))
+
+    def get_atten(self):
+        return self.atten
+
+    def set_atten(self, atten):
+        self.atten = atten
+        self.blocks_multiply_const_vxx_0.set_k((self.atten, ))
+
+
+def main(top_block_cls=debug_afsk_transceiver_osmocom, options=None):
+    """Create the Qt application, start the flowgraph and block in the Qt event loop until it quits."""
+    from distutils.version import StrictVersion
+    minimum_qt = StrictVersion("4.5.0")
+    if StrictVersion(Qt.qVersion()) >= minimum_qt:
+        Qt.QApplication.setGraphicsSystem(gr.prefs().get_string('qtgui', 'style', 'raster'))
+    app = Qt.QApplication(sys.argv)
+
+    flowgraph = top_block_cls()
+    flowgraph.start()
+    flowgraph.show()
+
+    def shutdown_flowgraph():
+        flowgraph.stop()
+        flowgraph.wait()
+    app.connect(app, Qt.SIGNAL("aboutToQuit()"), shutdown_flowgraph)
+    app.exec_()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/flowgraphs/device_args_handler.py b/apps/flowgraphs/device_args_handler.py
new file mode 100644
index 0000000..346a07f
--- /dev/null
+++ b/apps/flowgraphs/device_args_handler.py
@@ -0,0 +1,5 @@
+# this module will be imported into your flowgraph
+
+def append_dev_args(device, dev_args):
+    # Yields 0 for empty dev_args and None otherwise; `device` is not used yet.
+    return 0 if not len(dev_args) else None
diff --git a/apps/flowgraphs/satellites/mpla.ogg b/apps/flowgraphs/satellites/mpla.ogg
new file mode 100644
index 0000000..67b83eb
Binary files /dev/null and b/apps/flowgraphs/satellites/mpla.ogg differ
diff --git a/cmake/Modules/FindFec.cmake b/cmake/Modules/FindFec.cmake
index d1197a8..76b4e42 100644
--- a/cmake/Modules/FindFec.cmake
+++ b/cmake/Modules/FindFec.cmake
@@ -22,4 +22,4 @@ FIND_LIBRARY(
 )
 
 INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS)
\ No newline at end of file
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(FEC DEFAULT_MSG FEC_LIBRARIES FEC_INCLUDE_DIRS) # sets FEC_FOUND only when both variables hold valid results
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 8bf9a01..e2a6153 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -81,6 +81,9 @@ if(NOT satnogs_sources)
 endif(NOT satnogs_sources)
 
 add_library(gnuradio-satnogs SHARED ${satnogs_sources})
+# Build-order edge only (does not link): the `fec` target must be processed before this library.
+add_dependencies(gnuradio-satnogs fec)
+
 target_link_libraries(gnuradio-satnogs
     ${Boost_LIBRARIES}
     ${GNURADIO_ALL_LIBRARIES}
diff --git a/libfec/CMakeLists.txt b/libfec/CMakeLists.txt
new file mode 100644
index 0000000..684a6d0
--- /dev/null
+++ b/libfec/CMakeLists.txt
@@ -0,0 +1,323 @@
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.8)
+project(libfec ASM C)
+
+option(BUILD_32BIT_ON_64BIT "Build a 32-bit library on a 64-bit system" OFF)
+
+# Select the release build type by default to get optimization flags
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+    message(STATUS "Build type not specified: defaulting to release.")
+endif()
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
+# Resolve helper modules relative to this file so they are found even when libfec is not the top-level project.
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
+# Library install directory; defaults to "lib" unless the caller provides LIB_INSTALL_DIR.
+if(NOT LIB_INSTALL_DIR)
+    set(LIB_INSTALL_DIR lib)
+endif()
+
+
+########################################################################
+# Version information (major.minor.patch plus an optional extra tag)
+########################################################################
+set(VERSION_INFO_MAJOR  3)
+set(VERSION_INFO_MINOR  0)
+set(VERSION_INFO_PATCH  0)
+# VERSION_INFO_EXTRA defaults to "git" unless the caller already defined it.
+if(NOT DEFINED VERSION_INFO_EXTRA)
+    set(VERSION_INFO_EXTRA "git")
+endif()
+include(Version)
+# NOTE(review): VERSION_INFO is not set in this file; presumably the Version module defines it - confirm.
+if(NOT DEFINED VERSION)
+    #set(VERSION "\"${VERSION_INFO_MAJOR}.${VERSION_INFO_MINOR}.${VERSION_INFO_PATCH}\"")
+    set(VERSION "\"${VERSION_INFO}\"")  # the stored value includes literal double quotes
+endif()
+
+
+########################################################################
+# Compiler specific setup
+########################################################################
+if(BUILD_32BIT_ON_64BIT)
+    set(CMAKE_SYSTEM_PROCESSOR "i386")
+    set(CMAKE_SIZEOF_VOID_P 4)
+    set(CMAKE_C_FLAGS -m32)
+    set(CMAKE_CXX_FLAGS -m32)
+    add_definitions(-m32)
+endif()
+# Map processor name and pointer size to TARGET_ARCH. ARM uses "^arm": the regex "^arm*" also matched any name starting with "ar".
+if((CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|x86|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 4))
+    set(TARGET_ARCH "x86")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") AND (CMAKE_SIZEOF_VOID_P EQUAL 8))
+    set(TARGET_ARCH "x64")
+elseif((CMAKE_SYSTEM_PROCESSOR MATCHES "i386") AND (CMAKE_SIZEOF_VOID_P EQUAL 8) AND (APPLE))
+    # Mac is weird like that.
+    set(TARGET_ARCH "x64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+    set(TARGET_ARCH "ARM")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le")
+    set(TARGET_ARCH "ppc64" "ppc64le")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
+    set(TARGET_ARCH "ppc64" "ppc")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
+    set(TARGET_ARCH "ppc")
+endif()
+
+# CMAKE_COMPILER_IS_CLANGCC is not a variable CMake defines; Clang is identified through the compiler ID instead.
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+    add_definitions(-Wall)
+    add_definitions(-Wno-unused)
+    # Architecture-specific code generation flags
+    if(TARGET_ARCH MATCHES "x64")
+         add_definitions(-fPIC)
+         add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "x86")
+         add_definitions(-mmmx)
+         add_definitions(-msse)
+         add_definitions(-msse2)
+    elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+         add_definitions(-fno-common)
+         add_definitions(-faltivec)
+    endif()
+
+endif()
+
+########################################################################
+# Find build dependencies
+########################################################################
+
+# libm; M_LIB is linked into the shared library target further down
+find_library(M_LIB m REQUIRED)
+# NOTE(review): find_library's REQUIRED option needs CMake >= 3.18, while this file declares 2.8 - confirm behaviour on older versions.
+
+########################################################################
+# config.h
+########################################################################
+# NOTE(review): HAVE_CONFIG_H is disabled and no config.h is generated in this file, so the HAVE_* results below appear unused here - confirm.
+#add_definitions(-DHAVE_CONFIG_H)
+
+# Checks for includes
+include(CheckIncludeFile)
+check_include_file("getopt.h"           HAVE_GETOPT_H)
+check_include_file("stdio.h"            HAVE_STDIO_H)
+check_include_file("stdlib.h"           HAVE_STDLIB_H)
+check_include_file("memory.h"           HAVE_MEMORY_H)
+check_include_file("string.h"           HAVE_STRING_H)
+
+# Checks for functions
+include(CheckFunctionExists)
+check_function_exists("getopt_long"      HAVE_GETOPT_LONG)
+check_function_exists("memset"           HAVE_MEMSET)
+check_function_exists("memmove"          HAVE_MEMMOVE)
+
+
+########################################################################
+# Select sources: architecture-specific files first, then the common list
+########################################################################
+# NOTE(review): some files in the x64/x86 lists (e.g. sumsq.c, sumsq_port.c) are appended again in the common list below - confirm the duplicates are intended.
+if(TARGET_ARCH MATCHES "x64")
+    list(APPEND libfec_sources
+        dotprod_port.c
+        peakval_port.c
+        sumsq.c
+        sumsq_port.c
+        cpu_mode_x86_64.c
+	##asm
+	#sse2bfly27-64.s
+	#sse2bfly29-64.s
+        )
+
+elseif(TARGET_ARCH MATCHES "x86")
+    list(APPEND libfec_sources
+        viterbi27_mmx.c
+        viterbi27_sse.c
+        viterbi27_sse2.c
+        viterbi29_mmx.c
+        viterbi29_sse.c
+        viterbi29_sse2.c
+        viterbi39_sse2.c
+        viterbi39_sse.c
+        viterbi39_mmx.c
+        viterbi615_mmx.c
+        viterbi615_sse.c
+        viterbi615_sse2.c
+        dotprod_mmx.c
+        dotprod_sse2.c
+        #peakval_mmx.c
+        #peakval_sse.c
+        #peakval_sse2.c
+        sumsq.c
+        sumsq_port.c
+        sumsq_sse2.c
+        sumsq_mmx.c
+        cpu_mode_x86.c
+	#asm
+	cpu_features.s
+	dotprod_mmx_assist.s
+	dotprod_sse2_assist.s
+	mmxbfly27.s
+	mmxbfly29.s
+	peak_mmx_assist.s
+	peak_sse2_assist.s
+	peak_sse_assist.s
+	peakval_mmx_assist.s
+	peakval_sse2_assist.s
+	peakval_sse_assist.s
+	sse2bfly27.s
+	sse2bfly29.s
+	ssebfly27.s
+	ssebfly29.s
+	sumsq_mmx_assist.s
+	sumsq_sse2_assist.s
+        )
+
+elseif(TARGET_ARCH MATCHES "ppc|ppc64")
+    list(APPEND libfec_sources
+        viterbi27_av.c
+        viterbi29_av.c
+        viterbi39_av.c
+        viterbi615_av.c
+        encode_rs_av.c
+        dotprod_av.c
+        sumsq_av.c
+        peakval_av.c
+        cpu_mode_ppc.c
+        )
+else()
+    list(APPEND libfec_sources
+        cpu_mode_generic.c
+        )
+
+endif()
+
+# Sources appended for every architecture (ccsds_tab.c and ccsds_tal.c are produced by custom commands later in this file).
+list(APPEND libfec_sources
+    fec.c
+    sim.c
+    viterbi27.c
+    viterbi27_port.c
+    viterbi29.c
+    viterbi29_port.c
+    viterbi39.c
+    viterbi39_port.c
+    viterbi615.c
+    viterbi615_port.c
+    encode_rs_char.c
+    encode_rs_int.c
+    encode_rs_8.c
+    decode_rs_char.c
+    decode_rs_int.c
+    decode_rs_8.c
+    init_rs_char.c
+    init_rs_int.c
+    encode_rs_ccsds.c
+    decode_rs_ccsds.c
+    dotprod.c
+    dotprod_port.c
+    peakval.c
+    peakval_port.c
+    sumsq.c
+    sumsq_port.c
+    ccsds_tab.c
+    ccsds_tal.c
+)
+
+
+################################################################################
+# Generate pkg-config file (libfec.pc from the libfec.pc.in template)
+################################################################################
+foreach(inc ${LIBFEC_INCLUDE_DIR})
+    list(APPEND LIBFEC_PC_CFLAGS "-I${inc}")
+endforeach()
+# NOTE(review): neither LIBFEC_INCLUDE_DIR nor LIBFEC_LIBRARY_DIRS is set in this file, so both loops look like no-ops unless a caller defines them - confirm.
+foreach(lib ${LIBFEC_LIBRARY_DIRS})
+    list(APPEND LIBFEC_PC_PRIV_LIBS "-L${lib}")
+endforeach()
+# The backslash-escaped ${prefix} / ${exec_prefix} references are stored literally instead of being expanded by CMake.
+set(LIBFEC_PC_PREFIX ${CMAKE_INSTALL_PREFIX})
+set(LIBFEC_PC_EXEC_PREFIX \${prefix})
+set(LIBFEC_PC_LIBDIR \${exec_prefix}/${LIB_INSTALL_DIR})
+set(LIBFEC_PC_INCLUDEDIR \${prefix}/include)
+set(LIBFEC_PC_VERSION ${VERSION})
+set(LIBFEC_PC_LIBS "-lfec")
+
+# Use space-delimiter in the .pc file, rather than CMake's semicolon separator
+string(REPLACE ";" " " LIBFEC_PC_CFLAGS "${LIBFEC_PC_CFLAGS}")
+string(REPLACE ";" " " LIBFEC_PC_LIBS "${LIBFEC_PC_LIBS}")
+
+# Unset these to avoid hard-coded paths in a cross-environment
+if(CMAKE_CROSSCOMPILING)
+    unset(LIBFEC_PC_CFLAGS)
+    unset(LIBFEC_PC_LIBS)
+endif()
+# @ONLY: only @VAR@ placeholders in the template are substituted; ${...} text is left untouched.
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libfec.pc.in
+    ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    @ONLY
+)
+
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/libfec.pc
+    DESTINATION ${LIB_INSTALL_DIR}/pkgconfig/
+)
+
+
+########################################################################
+# Setup libraries: table generators, the shared library, install rules
+########################################################################
+# generate ccsds_tab.c (the tool is named by target, so CMake substitutes the built executable's location)
+add_executable(gen_ccsds gen_ccsds.c init_rs_char.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tab.c
+    COMMAND gen_ccsds > ccsds_tab.c
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS gen_ccsds
+)
+
+# generate ccsds_tal.c (same scheme: output, working directory and tool all refer to this directory's build tree)
+add_executable(gen_ccsds_tal gen_ccsds_tal.c)
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ccsds_tal.c
+    COMMAND gen_ccsds_tal > ccsds_tal.c
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS gen_ccsds_tal
+)
+
+# libfec: target is named libfec_shared, but the produced library file is named "fec"
+add_library(libfec_shared SHARED ${libfec_sources})
+set_target_properties(libfec_shared PROPERTIES OUTPUT_NAME fec)
+target_link_libraries(libfec_shared ${M_LIB})
+
+install(TARGETS libfec_shared
+    DESTINATION ${LIB_INSTALL_DIR})
+install(FILES "${PROJECT_SOURCE_DIR}/fec.h"
+    DESTINATION include)
+
+
+########################################################################
+# Create uninstall target (runs a script generated from a template)
+########################################################################
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+    IMMEDIATE @ONLY)
+# Building the "uninstall" target executes the generated script with `cmake -P`.
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
+
+
+########################################################################
+# Print Summary (configure-time status output only)
+########################################################################
+message(STATUS "")
+message(STATUS "##########################################################")
+message(STATUS "## Building for version: ${VERSION}")
+message(STATUS "## Target Architecture:  ${TARGET_ARCH}")
+message(STATUS "## Using install prefix: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "##########################################################")
+message(STATUS "")
+
diff --git a/libfec/INSTALL b/libfec/INSTALL
new file mode 100644
index 0000000..7c003a2
--- /dev/null
+++ b/libfec/INSTALL
@@ -0,0 +1,51 @@
+INSTALLATION INSTRUCTIONS
+
+CMake-based build:
+
+Works on most platforms. Do
+
+mkdir build
+cd build
+cmake ..
+make
+
+
+If that fails, try the older automake-based build:
+
+./bootstrap
+./configure
+make
+make test (optional)
+make install (as root)
+
+By default, "make install" puts the libfec libraries in
+/usr/local/lib, the include files in /usr/local/include, and the
+manual page in /usr/local/man.
+
+You may have an old version of the GNU assembler that cannot handle
+the relatively new SSE2 mnemonics. Update your version of the GNU
+"binutils" package.
+
+You may obtain the latest binutils package through your normal
+distribution channels or from:
+
+http://sources.redhat.com/binutils/
+
+TESTING THE FEC LIBRARY
+
+After running the ./configure script, optional tests can be built and
+run as follows:
+
+make test
+
+"make test" tests each routine, using the SIMD versions as
+appropriate, verifying correct operation and estimating Viterbi
+decoding speeds. These tests should always succeed unless something is
+broken.
+
+28 Mar 2004
+Phil Karn, karn@ka9q.net
+
+3 Jan 2014
+Matthias P. Braendli, matthias@mpb.li
+
diff --git a/libfec/LICENSE b/libfec/LICENSE
new file mode 100644
index 0000000..5a883d3
--- /dev/null
+++ b/libfec/LICENSE
@@ -0,0 +1,502 @@
+GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+(This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.)
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    {description}
+    Copyright (C) {year} {fullname}
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  {signature of Ty Coon}, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/libfec/README b/libfec/README
new file mode 100644
index 0000000..68d043e
--- /dev/null
+++ b/libfec/README
@@ -0,0 +1,125 @@
+COPYRIGHT
+
+This package is copyright 2006 by Phil Karn, KA9Q. It may be used
+under the terms of the GNU Lesser General Public License (LGPL). See
+the file "lesser.txt" in this package for license details.
+
+It has been modified by Matthias P. Braendli, HB9EGM, so that it
+compiles for x86_64 and for arm.
+
+For installation instructions, please see INSTALL
+
+INTRODUCTION
+
+This package provides a set of functions that implement several
+popular forward error correction (FEC) algorithms and several low-level routines
+useful in modems implemented with digital signal processing (DSP).
+
+The following routines are provided:
+
+1. Viterbi decoders for the following convolutional codes:
+
+r=1/2 k=7 ("Voyager" code, now a widely used industry standard)
+r=1/2 k=9 (Used on the IS-95 CDMA forward link)
+r=1/6 k=15 ("Cassini" code, used by several NASA/JPL deep space missions)
+
+2. Reed-Solomon encoders and decoders for any user-specified code.
+
+3. Optimized encoder and decoder for the CCSDS-standard (255,223)
+Reed-Solomon code, with and without the CCSDS-standard "dual basis"
+symbol representation.
+
+4. Compute dot product between a 16-bit buffer and a set of 16-bit
+coefficients. This is the basic DSP primitive for digital filtering
+and correlation.
+
+5. Compute sum of squares of a buffer of 16-bit signed integers. This is
+useful in DSP for finding the total energy in a signal.
+
+6. Find peak value in a buffer of 16-bit signed integers, useful for
+scaling a signal to prevent overflow.
+
+SIMD SUPPORT
+
+This package automatically makes use of various SIMD (Single
+Instruction stream, Multiple Data stream) instruction sets, when
+available: MMX, SSE and SSE2 on the IA-32 (Intel) architecture, and
+Altivec on the PowerPC G4 and G5 used by Power Macintoshes.
+
+"Altivec" is a Motorola trademark; Apple calls it "Velocity Engine",
+and IBM calls it "VMX". Altivec is roughly comparable to SSE2 on the
+IA-32.
+
+Many of the SIMD versions run more than an order of
+magnitude faster than their portable C versions. The available SIMD
+instruction sets, if any, are determined at run time and the proper
+version of each routine is automatically selected. If no SIMD
+instructions are available, the portable C version is invoked by
+default. On targets other than IA-32 and PPC, only the portable C
+version is built.
+
+The SIMD-assisted versions generally produce the same results as the C
+versions, with a few minor exceptions. The Viterbi decoders in C have
+a very slightly greater Eb/No performance due to their use of 32-bit
+path metrics. On the other hand, the SIMD versions use the
+"saturating" arithmetic available in these instructions to avoid the
+integer wraparounds that can occur in C when argument ranges are not
+properly constrained. This applies primarily to the "dotprod" (dot
+product) function.
+
+The MMX (MultiMedia eXtensions) instruction set was introduced on
+later Pentium CPUs; it is also implemented on the Pentium II and most
+AMD CPUs starting with the K6. SSE (Streaming SIMD Extensions) was
+introduced in the Pentium III; AMD calls it "3D Now! Professional".
+Intel introduced SSE2 on the Pentium 4, and it has been picked up by
+later AMD CPUs. SSE support implies MMX support, while SSE2 support
+implies both SSE and MMX support.
+
+The latest IA-32 SIMD instruction set, SSE3 (also known as "Prescott
+New Instructions") was introduced in early 2004 with the latest
+("Prescott") revision of the Pentium 4. Relatively little was
+introduced with SSE3, and this library currently makes no use of it.
+
+See the various manual pages for details on how to use the library
+routines.
+
+Copyright 2006, Phil Karn, KA9Q
+karn@ka9q.net
+http://www.ka9q.net/
+
+This software may be used under the terms of the GNU Lesser General
+Public License (LGPL); see the file lesser.txt for details.
+
+Revision history:
+Version 1.0 released 29 May 2001
+
+Version 2.0 released 3 Dec 2001:
+Restructured to add support for shared libraries.
+
+Version 2.0.1 released 8 Dec 2001:
+Includes autoconf/configure script
+
+Version 2.0.2 released 4 Feb 2002:
+Add SIMD version override options
+Test for lack of SSE2 mnemonic support in 'as'
+Build only selected version
+
+Version 2.0.3 released 6 Feb 2002:
+Fix to parityb function in parity.h
+
+feclib version 1.0 released November 2003
+Merged SIMD-Viterbi, RS and DSP libraries
+Changed SIMD Viterbi decoder to detect SSE2/SSE/MMX at runtime rather than build time
+
+feclib version 2.0 (unreleased) Mar 2004
+General speedups and cleanups
+Switch from 4 to 8-bit input symbols on all Viterbi decoders
+Support for Altivec on PowerPC
+Support for k=15 r=1/6 Cassini/Mars Pathfinder/Mars Exploration Rover/STEREO code
+Changed license to GNU Lesser General Public License (LGPL)
+
+feclib version 2.1 June 5 2006
+Added error checking, fixed alignment bug in SSE2 versions of Viterbi decoders causing segfaults
+
+feclib version 2.1.1 June 6 2006
+Fix test/benchmark time measurement on Linux
diff --git a/libfec/README.x86-64 b/libfec/README.x86-64
new file mode 100644
index 0000000..bb4450c
--- /dev/null
+++ b/libfec/README.x86-64
@@ -0,0 +1,13 @@
+This library has been modified to compile natively on x86-64.
+
+An attempt was made to adapt the assembly code, but it is unfinished: shared libraries
+on x86-64 must be compiled as position-independent code (PIC), and the issues this
+causes for the assembly code remain unsolved.
+
+This code therefore only uses the portable C implementation, which is certainly slower than
+the assembly SSE2 that could ideally be used.
+
+In short, performance has been traded for the ability to compile on x86-64.
+
+feb, 2012
+Matthias P. Braendli, HB9EGM
diff --git a/libfec/bootstrap b/libfec/bootstrap
new file mode 100755
index 0000000..2f58d5c
--- /dev/null
+++ b/libfec/bootstrap
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Regenerate the autotools files; the chain stops at the first step that fails.
+aclocal &&
+autoheader &&
+autoconf
+
diff --git a/libfec/ccsds.h b/libfec/ccsds.h
new file mode 100644
index 0000000..ae65468
--- /dev/null
+++ b/libfec/ccsds.h
@@ -0,0 +1,5 @@
+typedef unsigned char data_t; /* symbol type: one unsigned byte per symbol */
+extern unsigned char Taltab[],Tal1tab[]; /* byte tables defined in another translation unit; NOTE(review): presumably the CCSDS basis-conversion tables -- confirm */
+#define NN 255 /* NOTE(review): presumably the number of symbols per code block -- confirm */
+#define NROOTS 32 /* NOTE(review): presumably the number of parity symbols (generator polynomial roots) -- confirm */
+
diff --git a/libfec/char.h b/libfec/char.h
new file mode 100644
index 0000000..25efd65
--- /dev/null
+++ b/libfec/char.h
@@ -0,0 +1,24 @@
+/* Stuff specific to the 8-bit symbol version of the general purpose RS codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned char data_t; /* one code symbol is stored in one unsigned byte */
+/* MODNN(x) forwards x to modnn() together with the pointer `rs`; modnn() is not defined in this header. */
+#define MODNN(x) modnn(rs,x)
+/* Each macro below reads one field through a pointer named `rs`, which must be in scope wherever the macro is used. */
+#define MM (rs->mm) /* NOTE(review): presumably bits per symbol -- confirm against the struct definition */
+#define NN (rs->nn) /* NOTE(review): presumably symbols per block -- confirm */
+#define ALPHA_TO (rs->alpha_to) /* NOTE(review): presumably the antilog lookup table -- confirm */
+#define INDEX_OF (rs->index_of) /* NOTE(review): presumably the log lookup table -- confirm */
+#define GENPOLY (rs->genpoly) /* NOTE(review): presumably the generator polynomial -- confirm */
+#define NROOTS (rs->nroots) /* NOTE(review): presumably the number of generator roots (parity symbols) -- confirm */
+#define FCR (rs->fcr) /* NOTE(review): presumably the first consecutive root -- confirm */
+#define PRIM (rs->prim) /* NOTE(review): presumably the primitive element -- confirm */
+#define IPRIM (rs->iprim) /* NOTE(review): presumably the inverse of PRIM -- confirm */
+#define PAD (rs->pad) /* NOTE(review): presumably padding symbols in a shortened block -- confirm */
+#define A0 (NN) /* expands to (rs->nn); NOTE(review): presumably a sentinel index value -- confirm */
+
+
+
+
diff --git a/libfec/cmake/Modules/Version.cmake b/libfec/cmake/Modules/Version.cmake
new file mode 100644
index 0000000..e8d5bd5
--- /dev/null
+++ b/libfec/cmake/Modules/Version.cmake
@@ -0,0 +1,115 @@
+# Portions of this file have been borrowed from and/or inspired by
+# the Version.cmake from the rtl-sdr project.
+#   http://sdr.osmocom.org/trac/wiki/rtl-sdr
+#
+# Provides:
+#   ${VERSION_INFO_BASE}     -  Major.Minor.Patch
+#   ${VERSION_INFO}          -  Major.Minor.Patch[-git_info]
+#
+# Requires values for:
+#   ${VERSION_INFO_MAJOR}    - Increment on API compatibility changes.
+#   ${VERSION_INFO_MINOR}    - Increment when adding features.
+#   ${VERSION_INFO_PATCH}    - Increment for bug and documentation changes.
+#
+# Optional:
+#   ${VERSION_INFO_EXTRA}    - Set to "git" to append git info. This is
+#                              intended only for non-versioned development
+#                              builds
+#   ${VERSION_INFO_OVERRIDE} - Set to a non-null value to override the
+#                              VERSION_INFO_EXTRA logic. This is intended
+#                              for automated snapshot builds from exported
+#                              trees, to pass in the git revision info.
+#
+if(DEFINED __INCLUDED_LIBFEC_VERSION_CMAKE)  # include guard: process this module at most once per scope
+    return()
+endif()
+set(__INCLUDED_LIBFEC_VERSION_CMAKE TRUE)    # guard named after libfec so it cannot collide with another project's Version.cmake
+
+################################################################################
+# Gather up variables provided by parent script
+################################################################################
+
+# Every component is mandatory. message(FATAL_ERROR) stops processing, so the
+# set() after each check is only reached when that component is defined.
+# VER_MAJ / VER_MIN / VER_PAT are the short aliases used further down.
+if(NOT DEFINED VERSION_INFO_MAJOR)
+    message(FATAL_ERROR "VERSION_INFO_MAJOR is not defined")
+endif()
+set(VER_MAJ ${VERSION_INFO_MAJOR})
+
+if(NOT DEFINED VERSION_INFO_MINOR)
+    message(FATAL_ERROR "VERSION_INFO_MINOR is not defined")
+endif()
+set(VER_MIN ${VERSION_INFO_MINOR})
+
+if(NOT DEFINED VERSION_INFO_PATCH)
+    message(FATAL_ERROR "VERSION_INFO_PATCH is not defined")
+endif()
+set(VER_PAT ${VERSION_INFO_PATCH})
+
+
+################################################################################
+# Craft version number, using git, if needed
+################################################################################
+find_package(Git QUIET)
+# GIT_INFO ends up as "-<rev>", "-<rev>-dirty", or "-unknown" when no revision can be determined.
+if(GIT_FOUND)
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --
+        ERROR_QUIET
+        RESULT_VARIABLE NOT_GIT_REPOSITORY
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    )
+
+    if(NOT_GIT_REPOSITORY)
+        set(GIT_INFO "-unknown")
+    else()
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD --
+            OUTPUT_VARIABLE GIT_REV OUTPUT_STRIP_TRAILING_WHITESPACE
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        )
+
+        # diff-index --quiet exits non-zero when tracked files differ from HEAD.
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD --
+            RESULT_VARIABLE GIT_DIRTY
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        )
+        if(GIT_DIRTY)
+            set(GIT_INFO "-${GIT_REV}-dirty")
+        else()
+            set(GIT_INFO "-${GIT_REV}")
+        endif()
+    endif()
+else()
+    message(WARNING "git missing -- unable to check libfec version.")
+    set(GIT_INFO "-unknown")  # same suffix as the non-repository case; never left unset or stale
+    unset(NOT_GIT_REPOSITORY)
+    unset(GIT_REV)
+    unset(GIT_DIRTY)
+endif()
+
+
+################################################################################
+# Provide VERSION_INFO_BASE and VERSION_INFO
+################################################################################
+set(VERSION_INFO_BASE "${VER_MAJ}.${VER_MIN}.${VER_PAT}")
+
+# Pick the suffix appended to the base version; the first matching rule wins.
+if(VERSION_INFO_OVERRIDE)
+    # Automated export builds: the caller forces the suffix.
+    set(_version_suffix "-${VERSION_INFO_OVERRIDE}")
+elseif("${VERSION_INFO_EXTRA}" STREQUAL "git")
+    # Intra-release builds: tag the version with the git revision info.
+    set(_version_suffix "-git${GIT_INFO}")
+elseif("${VERSION_INFO_EXTRA}" STREQUAL "")
+    # Versioned releases: the bare base version.
+    set(_version_suffix "")
+else()
+    # Any other VERSION_INFO_EXTRA value is a configuration error.
+    message(FATAL_ERROR
+        "Unexpected definition of VERSION_INFO_EXTRA: ${VERSION_INFO_EXTRA}")
+endif()
+set(VERSION_INFO "${VERSION_INFO_BASE}${_version_suffix}")
+unset(_version_suffix)
diff --git a/libfec/cmake/cmake_uninstall.cmake.in b/libfec/cmake/cmake_uninstall.cmake.in
new file mode 100644
index 0000000..2037e36
--- /dev/null
+++ b/libfec/cmake/cmake_uninstall.cmake.in
@@ -0,0 +1,21 @@
+# Remove every file recorded in install_manifest.txt, honouring DESTDIR.
+if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+endif()
+
+file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    # execute_process() passes arguments without a shell and avoids the deprecated exec_program().
+    execute_process(
+      COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}"
+      OUTPUT_VARIABLE rm_out RESULT_VARIABLE rm_retval)
+    if(NOT "${rm_retval}" STREQUAL 0)
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif()
+  else()
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif()
+endforeach()
diff --git a/libfec/config.guess b/libfec/config.guess
new file mode 100644
index 0000000..0f0fe71
--- /dev/null
+++ b/libfec/config.guess
@@ -0,0 +1,1516 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
+#   Inc.
+
+timestamp='2007-03-06'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner <per@bothner.com>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit build system type.
+
+me=`echo "$0" | sed -e 's,.*/,,'` # base name of this script: everything up to the last "/" in $0 is removed
+# Text printed by the --help option.
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+# Text printed by the --version option; it embeds $timestamp.
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+# Hint appended to error messages; it starts with a newline so it lands on its own line.
+help="
+Try \`$me --help' for more information."
+
+# Parse command line: serve the informational options; anything left over is rejected below.
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )     # Any other dash-prefixed word is reported as an invalid option.
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )      # First non-option word: stop parsing and leave it in the argument list.
+       break ;;
+  esac
+done
+
+if test $# != 0; then   # any argument still present at this point is an error
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+# Query uname once per field; a field is set to "unknown" when its uname call fails.
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    sh5el) machine=sh5le-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep __ELF__ >/dev/null
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+	        os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit ;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+	exit ;;
+    *:ekkoBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+	exit ;;
+    *:SolidBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+	exit ;;
+    macppc:MirBSD:*:*)
+	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    *:MirBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    alpha:OSF1:*:*)
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	exit ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit ;;
+    21064:Windows_NT:50:3)
+	echo alpha-dec-winnt3.5
+	exit ;;
+    Amiga*:UNIX_System_V:4.0:*)
+	echo m68k-unknown-sysv4
+	exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit ;;
+    *:z/VM:*:*)
+	echo s390-ibm-zvmoe
+	exit ;;
+    *:OS400:*:*)
+        echo powerpc-ibm-os400
+	exit ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	echo arm-acorn-riscix${UNAME_RELEASE}
+	exit ;;
+    arm:riscos:*:*|arm:RISCOS:*:*)
+	echo arm-unknown-riscos
+	exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	echo hppa1.1-hitachi-hiuxmpp
+	exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	if test "`(/bin/universe) 2>/dev/null`" = att ; then
+		echo pyramid-pyramid-sysv3
+	else
+		echo pyramid-pyramid-bsd
+	fi
+	exit ;;
+    NILE*:*:*:dcosx)
+	echo pyramid-pyramid-svr4
+	exit ;;
+    DRS?6000:unix:4.0:6*)
+	echo sparc-icl-nx6
+	exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7; exit ;;
+	esac ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    i86pc:SunOS:5.*:*)
+	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:*:*)
+	case "`/usr/bin/arch -k`" in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like `4.1.3-JL'.
+	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+	exit ;;
+    sun3*:SunOS:*:*)
+	echo m68k-sun-sunos${UNAME_RELEASE}
+	exit ;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+	case "`/bin/arch`" in
+	    sun3)
+		echo m68k-sun-sunos${UNAME_RELEASE}
+		;;
+	    sun4)
+		echo sparc-sun-sunos${UNAME_RELEASE}
+		;;
+	esac
+	exit ;;
+    aushp:SunOS:*:*)
+	echo sparc-auspex-sunos${UNAME_RELEASE}
+	exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+        exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+        echo m68k-milan-mint${UNAME_RELEASE}
+        exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+        echo m68k-hades-mint${UNAME_RELEASE}
+        exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+        echo m68k-unknown-mint${UNAME_RELEASE}
+        exit ;;
+    m68k:machten:*:*)
+	echo m68k-apple-machten${UNAME_RELEASE}
+	exit ;;
+    powerpc:machten:*:*)
+	echo powerpc-apple-machten${UNAME_RELEASE}
+	exit ;;
+    RISC*:Mach:*:*)
+	echo mips-dec-mach_bsd4.3
+	exit ;;
+    RISC*:ULTRIX:*:*)
+	echo mips-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    VAX*:ULTRIX*:*:*)
+	echo vax-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	echo clipper-intergraph-clix${UNAME_RELEASE}
+	exit ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c &&
+	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`$dummy $dummyarg` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	echo mips-mips-riscos${UNAME_RELEASE}
+	exit ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:Power_UNIX:*:*)
+	echo powerpc-harris-powerunix
+	exit ;;
+    m88k:CX/UX:7*:*)
+	echo m88k-harris-cxux7
+	exit ;;
+    m88k:*:4*:R4*)
+	echo m88k-motorola-sysv4
+	exit ;;
+    m88k:*:3*:R3*)
+	echo m88k-motorola-sysv3
+	exit ;;
+    AViiON:dgux:*:*)
+        # DG/UX returns AViiON for all architectures
+        UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+	then
+	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+	       [ ${TARGET_BINARY_INTERFACE}x = x ]
+	    then
+		echo m88k-dg-dgux${UNAME_RELEASE}
+	    else
+		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
+	fi
+ 	exit ;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	echo m88k-dolphin-sysv3
+	exit ;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	echo m88k-motorola-sysv3
+	exit ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	echo m88k-tektronix-sysv3
+	exit ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	echo m68k-tektronix-bsd
+	exit ;;
+    *:IRIX*:*:*)
+	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+	exit ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	echo i386-ibm-aix
+	exit ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
+		#include <sys/systemcfg.h>
+
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+		then
+			echo "$SYSTEM_NAME"
+		else
+			echo rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		echo rs6000-ibm-aix3.2.4
+	else
+		echo rs6000-ibm-aix3.2
+	fi
+	exit ;;
+    *:AIX:*:[45])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:*:*)
+	echo rs6000-ibm-aix
+	exit ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+	echo romp-ibm-bsd4.4
+	exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+	exit ;;                             # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	echo rs6000-bull-bosx
+	exit ;;
+    DPX/2?00:B.O.S.:*:*)
+	echo m68k-bull-sysv3
+	exit ;;
+    9000/[34]??:4.3bsd:1.*:*)
+	echo m68k-hp-bsd
+	exit ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	echo m68k-hp-bsd4.4
+	exit ;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	case "${UNAME_MACHINE}" in
+	    9000/31? )            HP_ARCH=m68000 ;;
+	    9000/[34]?? )         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                    case "${sc_cpu_version}" in
+                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                      532)                      # CPU_PA_RISC2_0
+                        case "${sc_kernel_bits}" in
+                          32) HP_ARCH="hppa2.0n" ;;
+                          64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                        esac ;;
+                    esac
+		fi
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^              //' << EOF >$dummy.c
+
+              #define _HPUX_SOURCE
+              #include <stdlib.h>
+              #include <unistd.h>
+
+              int main ()
+              {
+              #if defined(_SC_KERNEL_BITS)
+                  long bits = sysconf(_SC_KERNEL_BITS);
+              #endif
+                  long cpu  = sysconf (_SC_CPU_VERSION);
+
+                  switch (cpu)
+              	{
+              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+              	case CPU_PA_RISC2_0:
+              #if defined(_SC_KERNEL_BITS)
+              	    switch (bits)
+              		{
+              		case 64: puts ("hppa2.0w"); break;
+              		case 32: puts ("hppa2.0n"); break;
+              		default: puts ("hppa2.0"); break;
+              		} break;
+              #else  /* !defined(_SC_KERNEL_BITS) */
+              	    puts ("hppa2.0"); break;
+              #endif
+              	default: puts ("hppa1.0"); break;
+              	}
+                  exit (0);
+              }
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if [ ${HP_ARCH} = "hppa2.0w" ]
+	then
+	    eval $set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep __LP64__ >/dev/null
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
+	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+	exit ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit ;;
+    3050*:HI-UX:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	echo unknown-hitachi-hiuxwe2
+	exit ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+	echo hppa1.1-hp-bsd
+	exit ;;
+    9000/8??:4.3bsd:*:*)
+	echo hppa1.0-hp-bsd
+	exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+	echo hppa1.1-hp-osf
+	exit ;;
+    hp8??:OSF1:*:*)
+	echo hppa1.0-hp-osf
+	exit ;;
+    i*86:OSF1:*:*)
+	if [ -x /usr/sbin/sysversion ] ; then
+	    echo ${UNAME_MACHINE}-unknown-osf1mk
+	else
+	    echo ${UNAME_MACHINE}-unknown-osf1
+	fi
+	exit ;;
+    parisc*:Lites*:*:*)
+	echo hppa1.1-hp-lites
+	exit ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	echo c1-convex-bsd
+        exit ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+        exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	echo c34-convex-bsd
+        exit ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	echo c38-convex-bsd
+        exit ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	echo c4-convex-bsd
+        exit ;;
+    CRAY*Y-MP:*:*:*)
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*[A-Z]90:*:*:*)
+	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    *:UNICOS/mp:*:*)
+	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+        exit ;;
+    5000:UNIX_System_V:4.*:*)
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+	exit ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:FreeBSD:*:*)
+	case ${UNAME_MACHINE} in
+	    pc98)
+		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    amd64)
+		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	    *)
+		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	esac
+	exit ;;
+    i*:CYGWIN*:*)
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit ;;
+    *:MINGW*:*)
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit ;;
+    i*:windows32*:*)
+    	# uname -m includes "-pc" on this system.
+    	echo ${UNAME_MACHINE}-mingw32
+	exit ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit ;;
+    *:Interix*:[3456]*)
+    	case ${UNAME_MACHINE} in
+	    x86) 
+		echo i586-pc-interix${UNAME_RELEASE}
+		exit ;;
+	    EM64T | authenticamd)
+		echo x86_64-unknown-interix${UNAME_RELEASE}
+		exit ;;
+	esac ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# UNAME_MACHINE based on the output of uname instead of i386?
+	echo i586-pc-interix
+	exit ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	echo x86_64-unknown-cygwin
+	exit ;;
+    p*:CYGWIN*:*)
+	echo powerpcle-unknown-cygwin
+	exit ;;
+    prep*:SunOS:5.*:*)
+	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    *:GNU:*:*)
+	# the GNU system
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	exit ;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	exit ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit ;;
+    arm*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    avr32*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    cris:Linux:*:*)
+	echo cris-axis-linux-gnu
+	exit ;;
+    crisv32:Linux:*:*)
+	echo crisv32-axis-linux-gnu
+	exit ;;
+    frv:Linux:*:*)
+    	echo frv-unknown-linux-gnu
+	exit ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m32r*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    mips:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mipsel
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips64
+	#undef mips64el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mips64el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips64
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	;;
+    or32:Linux:*:*)
+	echo or32-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
+	exit ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
+	esac
+	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit ;;
+    sh64*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    vax:Linux:*:*)
+	echo ${UNAME_MACHINE}-dec-linux-gnu
+	exit ;;
+    x86_64:Linux:*:*)
+	echo x86_64-unknown-linux-gnu
+	exit ;;
+    xtensa:Linux:*:*)
+    	echo xtensa-unknown-linux-gnu
+	exit ;;
+    i*86:Linux:*:*)
+	# The BFD linker knows what the default object file format is, so
+	# first see if it will tell us. cd to the root directory to prevent
+	# problems with other programs or directories called `ld' in the path.
+	# Set LC_ALL=C to ensure ld outputs messages in English.
+	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
+			 | sed -ne '/supported targets:/!d
+				    s/[ 	][ 	]*/ /g
+				    s/.*supported targets: *//
+				    s/ .*//
+				    p'`
+        case "$ld_supported_targets" in
+	  elf32-i386)
+		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
+		;;
+	  a.out-i386-linux)
+		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
+		exit ;;
+	  coff-i386)
+		echo "${UNAME_MACHINE}-pc-linux-gnucoff"
+		exit ;;
+	  "")
+		# Either a pre-BFD a.out linker (linux-gnuoldld) or
+		# one that does not give us useful --help.
+		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
+		exit ;;
+	esac
+	# Determine whether the default compiler is a.out or elf
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#ifdef __ELF__
+	# ifdef __GLIBC__
+	#  if __GLIBC__ >= 2
+	LIBC=gnu
+	#  else
+	LIBC=gnulibc1
+	#  endif
+	# else
+	LIBC=gnulibc1
+	# endif
+	#else
+	#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+	LIBC=gnu
+	#else
+	LIBC=gnuaout
+	#endif
+	#endif
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^LIBC/{
+		s: ::g
+		p
+	    }'`"
+	test x"${LIBC}" != x && {
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+		exit
+	}
+	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	echo i386-sequent-sysv4
+	exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+        # Unixware is an offshoot of SVR4, but it has its own version
+        # number series starting with 2...
+        # I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+        # Use sysv4.2uw... so that sysv4* matches it.
+	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+	exit ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit ;;
+    i*86:syllable:*:*)
+	echo ${UNAME_MACHINE}-pc-syllable
+	exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+	else
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+	fi
+	exit ;;
+    i*86:*:5:[678]*)
+    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit ;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+	else
+		echo ${UNAME_MACHINE}-pc-sysv32
+	fi
+	exit ;;
+    pc:*:*:*)
+	# Left here for compatibility:
+        # uname -m prints for DJGPP always 'pc', but it prints nothing about
+        # the processor, so we play safe by assuming i386.
+	echo i386-pc-msdosdjgpp
+        exit ;;
+    Intel:Mach:3*:*)
+	echo i386-pc-mach3
+	exit ;;
+    paragon:*:*:*)
+	echo i860-intel-osf1
+	exit ;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+	fi
+	exit ;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	echo m68010-convergent-sysv
+	exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit ;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+          && { echo i486-ncr-sysv4; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	echo m68k-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    mc68030:UNIX_System_V:4.*:*)
+	echo m68k-atari-sysv4
+	exit ;;
+    TSUNAMI:LynxOS:2.*:*)
+	echo sparc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    rs6000:LynxOS:2.*:*)
+	echo rs6000-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    SM[BE]S:UNIX_SV:*:*)
+	echo mips-dde-sysv${UNAME_RELEASE}
+	exit ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    RM*:SINIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		echo ${UNAME_MACHINE}-sni-sysv4
+	else
+		echo ns32k-sni-sysv
+	fi
+	exit ;;
+    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                      # says <Richard.M.Bartel@ccMail.Census.GOV>
+        echo i586-unisys-sysv4
+        exit ;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	echo hppa1.1-stratus-sysv4
+	exit ;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	echo i860-stratus-sysv4
+	exit ;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo ${UNAME_MACHINE}-stratus-vos
+	exit ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit ;;
+    mc68*:A/UX:*:*)
+	echo m68k-apple-aux${UNAME_RELEASE}
+	exit ;;
+    news*:NEWS-OS:6*:*)
+	echo mips-sony-newsos6
+	exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if [ -d /usr/nec ]; then
+	        echo mips-nec-sysv${UNAME_RELEASE}
+	else
+	        echo mips-unknown-sysv${UNAME_RELEASE}
+	fi
+        exit ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-7:SUPER-UX:*:*)
+	echo sx7-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8:SUPER-UX:*:*)
+	echo sx8-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8R:SUPER-UX:*:*)
+	echo sx8r-nec-superux${UNAME_RELEASE}
+	exit ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+	case $UNAME_PROCESSOR in
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = "x86"; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit ;;
+    NSE-?:NONSTOP_KERNEL:*:*)
+	echo nse-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "$cputype" = "386"; then
+	    UNAME_MACHINE=i386
+	else
+	    UNAME_MACHINE="$cputype"
+	fi
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit ;;
+    SEI:*:*:SEIUX)
+        echo mips-sei-seiux${UNAME_RELEASE}
+	exit ;;
+    *:DragonFly:*:*)
+	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	exit ;;
+    *:*VMS:*:*)
+    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case "${UNAME_MACHINE}" in
+	    A*) echo alpha-dec-vms ; exit ;;
+	    I*) echo ia64-dec-vms ; exit ;;
+	    V*) echo vax-dec-vms ; exit ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	echo i386-pc-xenix
+	exit ;;
+    i*86:skyos:*:*)
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+	exit ;;
+    i*86:rdos:*:*)
+	echo ${UNAME_MACHINE}-pc-rdos
+	exit ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+          "4"
+#else
+	  ""
+#endif
+         ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac
+fi
+# Nothing matched: dump every probe result to stderr so the user can forward it to the maintainers.
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+and
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/libfec/config.sub b/libfec/config.sub
new file mode 100755
index 0000000..a06a480
--- /dev/null
+++ b/libfec/config.sub
@@ -0,0 +1,1362 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
+#   Free Software Foundation, Inc.
+
+timestamp='2001-04-20'
+
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Please send patches to <config-patches@gnu.org>.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do	# handle leading options; stop at the first non-option word
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit 0 ;;
+    --version | -v )
+       echo "$version" ; exit 0 ;;
+    --help | --h* | -h )
+       echo "$usage"; exit 0 ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )	# Unknown option: diagnose on stderr like the other usage errors, keeping stdout clean.
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types (quoted so the name is echoed verbatim).
+       echo "$1"
+       exit 0;;
+
+    * )
+       break ;;
+  esac
+done
+
+if test $# -eq 0; then	# no configuration name left after option parsing
+  echo "$me: missing argument$help" >&2
+  exit 1
+elif test $# -gt 1; then	# exactly one configuration name is accepted
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`	# last two dash-separated fields, if present
+case $maybe_os in
+  nto-qnx* | linux-gnu* | storm-chaos* | os2-emx*)	# recognised two-word KERNEL-OS suffixes
+    os=-$maybe_os
+    basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+    ;;
+  *)
+    basic_machine=`echo "$1" | sed 's/-[^-]*$//'`	# strip the final -FIELD
+    if [ "$basic_machine" != "$1" ]	# quoted: safe when the argument holds whitespace or glob characters
+    then os=`echo "$1" | sed 's/.*-/-/'`
+    else os=; fi	# no dash in the argument, so no OS part was given
+    ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work.  We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+	-sun*os*)
+		# Prevent following clause from handling this invalid input.
+		;;
+	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+	-apple | -axis)
+		os=
+		basic_machine=$1
+		;;
+	-sim | -cisco | -oki | -wec | -winbond)
+		os=
+		basic_machine=$1
+		;;
+	-scout)
+		;;
+	-wrs)
+		os=-vxworks
+		basic_machine=$1
+		;;
+	-hiux*)
+		os=-hiuxwe2
+		;;
+	-sco5)
+		os=-sco3.2v5
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco4)
+		os=-sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2.[4-9]*)
+		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2v[4-9]*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco*)
+		os=-sco3.2v2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-udk*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-isc)
+		os=-isc2.2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-clix*)
+		basic_machine=clipper-intergraph
+		;;
+	-isc*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-lynx*)
+		os=-lynxos
+		;;
+	-ptx*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+		;;
+	-windowsnt*)
+		os=`echo $os | sed -e 's/windowsnt/winnt/'`
+		;;
+	-psos*)
+		os=-psos
+		;;
+	-mint | -mint[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+	# Recognize the basic CPU types without company name.
+	# Some are omitted here because they have special meanings below.
+	tahoe | i860 | ia64 | m32r | m68k | m68000 | m88k | ns32k | arc \
+	        | arm | arme[lb] | arm[bl]e | armv[2345] | armv[345][lb] | strongarm | xscale \
+		| pyramid | mn10200 | mn10300 | tron | a29k \
+		| 580 | i960 | h8300 \
+		| x86 | ppcbe | mipsbe | mipsle | shbe | shle \
+		| hppa | hppa1.0 | hppa1.1 | hppa2.0 | hppa2.0w | hppa2.0n \
+		| hppa64 \
+		| alpha | alphaev[4-8] | alphaev56 | alphapca5[67] \
+		| alphaev6[78] \
+		| we32k | ns16k | clipper | i370 | sh | sh[34] \
+		| powerpc | powerpcle \
+		| 1750a | dsp16xx | pdp10 | pdp11 \
+		| mips16 | mips64 | mipsel | mips64el \
+		| mips64orion | mips64orionel | mipstx39 | mipstx39el \
+		| mips64vr4300 | mips64vr4300el | mips64vr4100 | mips64vr4100el \
+		| mips64vr5000 | mips64vr5000el | mcore | s390 | s390x \
+		| sparc | sparclet | sparclite | sparc64 | sparcv9 | sparcv9b \
+		| v850 | c4x \
+		| thumb | d10v | d30v | fr30 | avr | openrisc | tic80 \
+		| pj | pjl | h8500)
+		basic_machine=$basic_machine-unknown	# bare CPU name: append the placeholder vendor "unknown"
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12)
+		# Motorola 68HC11/12.
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | z8k | v70 | w65)
+		;;
+
+	# We use `pc' rather than `unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+	  basic_machine=$basic_machine-pc
+	  ;;
+	# Object if more than one company name word.
+	*-*-*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+	# Recognize the basic CPU types with company name.
+	# FIXME: clean up the formatting here.
+	vax-* | tahoe-* | i*86-* | i860-* | ia64-* | m32r-* | m68k-* | m68000-* \
+	      | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | c[123]* \
+	      | arm-*  | armbe-* | armle-* | armv*-* | strongarm-* | xscale-* \
+	      | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
+	      | power-* | none-* | 580-* | cray2-* | h8300-* | h8500-* | i960-* \
+	      | xmp-* | ymp-* \
+	      | x86-* | ppcbe-* | mipsbe-* | mipsle-* | shbe-* | shle-* \
+	      | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* | hppa2.0w-* \
+	      | hppa2.0n-* | hppa64-* \
+	      | alpha-* | alphaev[4-8]-* | alphaev56-* | alphapca5[67]-* \
+	      | alphaev6[78]-* \
+	      | we32k-* | cydra-* | ns16k-* | pn-* | np1-* | xps100-* \
+	      | clipper-* | orion-* \
+	      | sparclite-* | pdp10-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
+	      | sparc64-* | sparcv9-* | sparcv9b-* | sparc86x-* \
+	      | mips16-* | mips64-* | mipsel-* \
+	      | mips64el-* | mips64orion-* | mips64orionel-* \
+	      | mips64vr4100-* | mips64vr4100el-* | mips64vr4300-* | mips64vr4300el-* \
+	      | mipstx39-* | mipstx39el-* | mcore-* \
+	      | f30[01]-* | f700-* | s390-* | s390x-* | sv1-* | t3e-* \
+	      | [cjt]90-* \
+	      | m88110-* | m680[01234]0-* | m683?2-* | m68360-* | z8k-* | d10v-* \
+	      | thumb-* | v850-* | d30v-* | tic30-* | tic80-* | c30-* | fr30-* \
+	      | bs2000-* | tic54x-* | c54x-* | x86_64-* | pj-* | pjl-*)
+		;;
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	386bsd)
+		basic_machine=i386-unknown
+		os=-bsd
+		;;
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		basic_machine=m68000-att
+		;;
+	3b*)
+		basic_machine=we32k-att
+		;;
+	a29khif)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	adobe68k)
+		basic_machine=m68010-adobe
+		os=-scout
+		;;
+	alliant | fx80)
+		basic_machine=fx80-alliant
+		;;
+	altos | altos3068)
+		basic_machine=m68k-altos
+		;;
+	am29k)
+		basic_machine=a29k-none
+		os=-bsd
+		;;
+	amdahl)
+		basic_machine=580-amdahl
+		os=-sysv
+		;;
+	amiga | amiga-*)
+		basic_machine=m68k-unknown
+		;;
+	amigaos | amigados)
+		basic_machine=m68k-unknown
+		os=-amigaos
+		;;
+	amigaunix | amix)
+		basic_machine=m68k-unknown
+		os=-sysv4
+		;;
+	apollo68)
+		basic_machine=m68k-apollo
+		os=-sysv
+		;;
+	apollo68bsd)
+		basic_machine=m68k-apollo
+		os=-bsd
+		;;
+	aux)
+		basic_machine=m68k-apple
+		os=-aux
+		;;
+	balance)
+		basic_machine=ns32k-sequent
+		os=-dynix
+		;;
+	convex-c1)
+		basic_machine=c1-convex
+		os=-bsd
+		;;
+	convex-c2)
+		basic_machine=c2-convex
+		os=-bsd
+		;;
+	convex-c32)
+		basic_machine=c32-convex
+		os=-bsd
+		;;
+	convex-c34)
+		basic_machine=c34-convex
+		os=-bsd
+		;;
+	convex-c38)
+		basic_machine=c38-convex
+		os=-bsd
+		;;
+	cray | ymp)
+		basic_machine=ymp-cray
+		os=-unicos
+		;;
+	cray2)
+		basic_machine=cray2-cray
+		os=-unicos
+		;;
+	[cjt]90)
+		basic_machine=${basic_machine}-cray
+		os=-unicos
+		;;
+	crds | unos)
+		basic_machine=m68k-crds
+		;;
+	cris | cris-* | etrax*)
+		basic_machine=cris-axis
+		;;
+	da30 | da30-*)
+		basic_machine=m68k-da30
+		;;
+	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+		basic_machine=mips-dec
+		;;
+	delta | 3300 | motorola-3300 | motorola-delta \
+	      | 3300-motorola | delta-motorola)
+		basic_machine=m68k-motorola
+		;;
+	delta88)
+		basic_machine=m88k-motorola
+		os=-sysv3
+		;;
+	dpx20 | dpx20-*)
+		basic_machine=rs6000-bull
+		os=-bosx
+		;;
+	dpx2* | dpx2*-bull)
+		basic_machine=m68k-bull
+		os=-sysv3
+		;;
+	ebmon29k)
+		basic_machine=a29k-amd
+		os=-ebmon
+		;;
+	elxsi)
+		basic_machine=elxsi-elxsi
+		os=-bsd
+		;;
+	encore | umax | mmax)
+		basic_machine=ns32k-encore
+		;;
+	es1800 | OSE68k | ose68k | ose | OSE)
+		basic_machine=m68k-ericsson
+		os=-ose
+		;;
+	fx2800)
+		basic_machine=i860-alliant
+		;;
+	genix)
+		basic_machine=ns32k-ns
+		;;
+	gmicro)
+		basic_machine=tron-gmicro
+		os=-sysv
+		;;
+	go32)
+		basic_machine=i386-pc
+		os=-go32
+		;;
+	h3050r* | hiux*)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	h8300hms)
+		basic_machine=h8300-hitachi
+		os=-hms
+		;;
+	h8300xray)
+		basic_machine=h8300-hitachi
+		os=-xray
+		;;
+	h8500hms)
+		basic_machine=h8500-hitachi
+		os=-hms
+		;;
+	harris)
+		basic_machine=m88k-harris
+		os=-sysv3
+		;;
+	hp300-*)
+		basic_machine=m68k-hp
+		;;
+	hp300bsd)
+		basic_machine=m68k-hp
+		os=-bsd
+		;;
+	hp300hpux)
+		basic_machine=m68k-hp
+		os=-hpux
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		basic_machine=m68000-hp
+		;;
+	hp9k3[2-9][0-9])
+		basic_machine=m68k-hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hppa-next)
+		os=-nextstep3
+		;;
+	hppaosf)
+		basic_machine=hppa1.1-hp
+		os=-osf
+		;;
+	hppro)
+		basic_machine=hppa1.1-hp
+		os=-proelf
+		;;
+	i370-ibm* | ibm*)
+		basic_machine=i370-ibm
+		;;
+# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
+	i*86v32)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv32
+		;;
+	i*86v4*)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv4
+		;;
+	i*86v)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv
+		;;
+	i*86sol2)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-solaris2
+		;;
+	i386mach)
+		basic_machine=i386-mach
+		os=-mach
+		;;
+	i386-vsta | vsta)
+		basic_machine=i386-unknown
+		os=-vsta
+		;;
+	iris | iris4d)
+		basic_machine=mips-sgi
+		case $os in
+		    -irix*)
+			;;
+		    *)
+			os=-irix4
+			;;
+		esac
+		;;
+	isi68 | isi)
+		basic_machine=m68k-isi
+		os=-sysv
+		;;
+	m88k-omron*)
+		basic_machine=m88k-omron
+		;;
+	magnum | m3230)
+		basic_machine=mips-mips
+		os=-sysv
+		;;
+	merlin)
+		basic_machine=ns32k-utek
+		os=-sysv
+		;;
+	mingw32)
+		basic_machine=i386-pc
+		os=-mingw32
+		;;
+	miniframe)
+		basic_machine=m68000-convergent
+		;;
+	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+	mipsel*-linux*)
+		basic_machine=mipsel-unknown
+		os=-linux-gnu
+		;;
+	mips*-linux*)
+		basic_machine=mips-unknown
+		os=-linux-gnu
+		;;
+	mips3*-*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+		;;
+	mips3*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+		;;
+	mmix*)
+		basic_machine=mmix-knuth
+		os=-mmixware
+		;;
+	monitor)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	msdos)
+		basic_machine=i386-pc
+		os=-msdos
+		;;
+	mvs)
+		basic_machine=i370-ibm
+		os=-mvs
+		;;
+	ncr3000)
+		basic_machine=i486-ncr
+		os=-sysv4
+		;;
+	netbsd386)
+		basic_machine=i386-unknown
+		os=-netbsd
+		;;
+	netwinder)
+		basic_machine=armv4l-rebel
+		os=-linux
+		;;
+	news | news700 | news800 | news900)
+		basic_machine=m68k-sony
+		os=-newsos
+		;;
+	news1000)
+		basic_machine=m68030-sony
+		os=-newsos
+		;;
+	news-3600 | risc-news)
+		basic_machine=mips-sony
+		os=-newsos
+		;;
+	necv70)
+		basic_machine=v70-nec
+		os=-sysv
+		;;
+	next | m*-next )
+		basic_machine=m68k-next
+		case $os in
+		    -nextstep* )
+			;;
+		    -ns2*)
+		      os=-nextstep2
+			;;
+		    *)
+		      os=-nextstep3
+			;;
+		esac
+		;;
+	nh3000)
+		basic_machine=m68k-harris
+		os=-cxux
+		;;
+	nh[45]000)
+		basic_machine=m88k-harris
+		os=-cxux
+		;;
+	nindy960)
+		basic_machine=i960-intel
+		os=-nindy
+		;;
+	mon960)
+		basic_machine=i960-intel
+		os=-mon960
+		;;
+	nonstopux)
+		basic_machine=mips-compaq
+		os=-nonstopux
+		;;
+	np1)
+		basic_machine=np1-gould
+		;;
+	nsr-tandem)
+		basic_machine=nsr-tandem
+		;;
+	op50n-* | op60c-*)
+		basic_machine=hppa1.1-oki
+		os=-proelf
+		;;
+	OSE68000 | ose68000)
+		basic_machine=m68000-ericsson
+		os=-ose
+		;;
+	os68k)
+		basic_machine=m68k-none
+		os=-os68k
+		;;
+	pa-hitachi)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	paragon)
+		basic_machine=i860-intel
+		os=-osf
+		;;
+	pbd)
+		basic_machine=sparc-tti
+		;;
+	pbb)
+		basic_machine=m68k-tti
+		;;
+        pc532 | pc532-*)
+		basic_machine=ns32k-pc532
+		;;
+	pentium | p5 | k5 | k6 | nexgen)
+		basic_machine=i586-pc
+		;;
+	pentiumpro | p6 | 6x86 | athlon)
+		basic_machine=i686-pc
+		;;
+	pentiumii | pentium2)
+		basic_machine=i686-pc
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-*)
+		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumii-* | pentium2-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pn)
+		basic_machine=pn-gould
+		;;
+	power)	basic_machine=power-ibm
+		;;
+	ppc)	basic_machine=powerpc-unknown
+	        ;;
+	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppcle | powerpclittle | ppc-le | powerpc-little)
+		basic_machine=powerpcle-unknown
+	        ;;
+	ppcle-* | powerpclittle-*)
+		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ps2)
+		basic_machine=i386-ibm
+		;;
+	pw32)
+		basic_machine=i586-unknown
+		os=-pw32
+		;;
+	rom68k)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	rm[46]00)
+		basic_machine=mips-siemens
+		;;
+	rtpc | rtpc-*)
+		basic_machine=romp-ibm
+		;;
+	sa29200)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	sequent)
+		basic_machine=i386-sequent
+		;;
+	sh)
+		basic_machine=sh-hitachi
+		os=-hms
+		;;
+	sparclite-wrs)
+		basic_machine=sparclite-wrs
+		os=-vxworks
+		;;
+	sps7)
+		basic_machine=m68k-bull
+		os=-sysv2
+		;;
+	spur)
+		basic_machine=spur-unknown
+		;;
+	st2000)
+		basic_machine=m68k-tandem
+		;;
+	stratus)
+		basic_machine=i860-stratus
+		os=-sysv4
+		;;
+	sun2)
+		basic_machine=m68000-sun
+		;;
+	sun2os3)
+		basic_machine=m68000-sun
+		os=-sunos3
+		;;
+	sun2os4)
+		basic_machine=m68000-sun
+		os=-sunos4
+		;;
+	sun3os3)
+		basic_machine=m68k-sun
+		os=-sunos3
+		;;
+	sun3os4)
+		basic_machine=m68k-sun
+		os=-sunos4
+		;;
+	sun4os3)
+		basic_machine=sparc-sun
+		os=-sunos3
+		;;
+	sun4os4)
+		basic_machine=sparc-sun
+		os=-sunos4
+		;;
+	sun4sol2)
+		basic_machine=sparc-sun
+		os=-solaris2
+		;;
+	sun3 | sun3-*)
+		basic_machine=m68k-sun
+		;;
+	sun4)
+		basic_machine=sparc-sun
+		;;
+	sun386 | sun386i | roadrunner)
+		basic_machine=i386-sun
+		;;
+	sv1)
+		basic_machine=sv1-cray
+		os=-unicos
+		;;
+	symmetry)
+		basic_machine=i386-sequent
+		os=-dynix
+		;;
+	t3e)
+		basic_machine=t3e-cray
+		os=-unicos
+		;;
+	tic54x | c54x*)
+		basic_machine=tic54x-unknown
+		os=-coff
+		;;
+	tx39)
+		basic_machine=mipstx39-unknown
+		;;
+	tx39el)
+		basic_machine=mipstx39el-unknown
+		;;
+	tower | tower-32)
+		basic_machine=m68k-ncr
+		;;
+	udi29k)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	ultra3)
+		basic_machine=a29k-nyu
+		os=-sym1
+		;;
+	v810 | necv810)
+		basic_machine=v810-nec
+		os=-none
+		;;
+	vaxv)
+		basic_machine=vax-dec
+		os=-sysv
+		;;
+	vms)
+		basic_machine=vax-dec
+		os=-vms
+		;;
+	vpp*|vx|vx-*)
+               basic_machine=f301-fujitsu
+               ;;
+	vxworks960)
+		basic_machine=i960-wrs
+		os=-vxworks
+		;;
+	vxworks68)
+		basic_machine=m68k-wrs
+		os=-vxworks
+		;;
+	vxworks29k)
+		basic_machine=a29k-wrs
+		os=-vxworks
+		;;
+	w65*)
+		basic_machine=w65-wdc
+		os=-none
+		;;
+	w89k-*)
+		basic_machine=hppa1.1-winbond
+		os=-proelf
+		;;
+	xmp)
+		basic_machine=xmp-cray
+		os=-unicos
+		;;
+        xps | xps100)
+		basic_machine=xps100-honeywell
+		;;
+	z8k-*-coff)
+		basic_machine=z8k-unknown
+		os=-sim
+		;;
+	none)
+		basic_machine=none-none
+		os=-none
+		;;
+
+# Here we handle the default manufacturer of certain CPU types.  It is in
+# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		basic_machine=hppa1.1-winbond
+		;;
+	op50n)
+		basic_machine=hppa1.1-oki
+		;;
+	op60c)
+		basic_machine=hppa1.1-oki
+		;;
+	mips)
+		if [ x$os = x-linux-gnu ]; then
+			basic_machine=mips-unknown
+		else
+			basic_machine=mips-mips
+		fi
+		;;
+	romp)
+		basic_machine=romp-ibm
+		;;
+	rs6000)
+		basic_machine=rs6000-ibm
+		;;
+	vax)
+		basic_machine=vax-dec
+		;;
+	pdp10)
+		# there are many clones, so DEC is not a safe bet
+		basic_machine=pdp10-unknown
+		;;
+	pdp11)
+		basic_machine=pdp11-dec
+		;;
+	we32k)
+		basic_machine=we32k-att
+		;;
+	sh3 | sh4)
+		basic_machine=sh-unknown
+		;;
+	sparc | sparcv9 | sparcv9b)
+		basic_machine=sparc-sun
+		;;
+        cydra)
+		basic_machine=cydra-cydrome
+		;;
+	orion)
+		basic_machine=orion-highlevel
+		;;
+	orion105)
+		basic_machine=clipper-highlevel
+		;;
+	mac | mpw | mac-mpw)
+		basic_machine=m68k-apple
+		;;
+	pmac | pmac-mpw)
+		basic_machine=powerpc-apple
+		;;
+	c4x*)
+		basic_machine=c4x-none
+		os=-coff
+		;;
+	*-unknown)
+		# Make sure to match an already-canonicalized machine name.
+		;;
+	*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+	*-digital*)
+		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+		;;
+	*-commodore*)
+		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+        # First match some system type aliases
+        # that might get confused with valid system types.
+	# -solaris* is a basic system type, with this one exception.
+	-solaris1 | -solaris1.*)
+		os=`echo $os | sed -e 's|solaris1|sunos4|'`
+		;;
+	-solaris)
+		os=-solaris2
+		;;
+	-svr4*)
+		os=-sysv4
+		;;
+	-unixware*)
+		os=-sysv4.2uw
+		;;
+	-gnu/linux*)
+		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+		;;
+	# First accept the basic system types.
+	# The portable systems comes first.
+	# Each alternative MUST END IN A *, to match a version number.
+	# -sysv* is not here because it comes later, after sysvr4.
+	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+	      | -aos* \
+	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+	      | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
+	      | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \
+	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* | -os2*)
+	# Remember, each alternative MUST END IN *, to match a version number.
+		;;
+	-qnx*)
+		case $basic_machine in
+		    x86-* | i*86-*)
+			;;
+		    *)
+			os=-nto$os
+			;;
+		esac
+		;;
+	-nto*)
+		os=-nto-qnx
+		;;
+	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* \
+	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+		;;
+	-mac*)
+		os=`echo $os | sed -e 's|mac|macos|'`
+		;;
+	-linux*)
+		os=`echo $os | sed -e 's|linux|linux-gnu|'`
+		;;
+	-sunos5*)
+		os=`echo $os | sed -e 's|sunos5|solaris2|'`
+		;;
+	-sunos6*)
+		os=`echo $os | sed -e 's|sunos6|solaris3|'`
+		;;
+	-opened*)
+		os=-openedition
+		;;
+	-wince*)
+		os=-wince
+		;;
+	-osfrose*)
+		os=-osfrose
+		;;
+	-osf*)
+		os=-osf
+		;;
+	-utek*)
+		os=-bsd
+		;;
+	-dynix*)
+		os=-bsd
+		;;
+	-acis*)
+		os=-aos
+		;;
+	-386bsd)
+		os=-bsd
+		;;
+	-ctix* | -uts*)
+		os=-sysv
+		;;
+	-ns2 )
+	        os=-nextstep2
+		;;
+	-nsk*)
+		os=-nsk
+		;;
+	# Preserve the version number of sinix5.
+	-sinix5.*)
+		os=`echo $os | sed -e 's|sinix|sysv|'`
+		;;
+	-sinix*)
+		os=-sysv4
+		;;
+	-triton*)
+		os=-sysv3
+		;;
+	-oss*)
+		os=-sysv3
+		;;
+	-svr4)
+		os=-sysv4
+		;;
+	-svr3)
+		os=-sysv3
+		;;
+	-sysvr4)
+		os=-sysv4
+		;;
+	# This must come after -sysvr4.
+	-sysv*)
+		;;
+	-ose*)
+		os=-ose
+		;;
+	-es1800*)
+		os=-ose
+		;;
+	-xenix)
+		os=-xenix
+		;;
+        -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+	        os=-mint
+		;;
+	-none)
+		;;
+	*)
+		# Get rid of the `-' at the beginning of $os.
+		os=`echo $os | sed 's/[^-]*-//'`
+		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+		exit 1
+		;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+	*-acorn)
+		os=-riscix1.2
+		;;
+	arm*-rebel)
+		os=-linux
+		;;
+	arm*-semi)
+		os=-aout
+		;;
+	pdp10-*)
+		os=-tops20
+		;;
+        pdp11-*)
+		os=-none
+		;;
+	*-dec | vax-*)
+		os=-ultrix4.2
+		;;
+	m68*-apollo)
+		os=-domain
+		;;
+	i386-sun)
+		os=-sunos4.0.2
+		;;
+	m68000-sun)
+		os=-sunos3
+		# This also exists in the configure program, but was not the
+		# default.
+		# os=-sunos4
+		;;
+	m68*-cisco)
+		os=-aout
+		;;
+	mips*-cisco)
+		os=-elf
+		;;
+	mips*-*)
+		os=-elf
+		;;
+	*-tti)	# must be before sparc entry or we get the wrong os.
+		os=-sysv3
+		;;
+	sparc-* | *-sun)
+		os=-sunos4.1.1
+		;;
+	*-be)
+		os=-beos
+		;;
+	*-ibm)
+		os=-aix
+		;;
+	*-wec)
+		os=-proelf
+		;;
+	*-winbond)
+		os=-proelf
+		;;
+	*-oki)
+		os=-proelf
+		;;
+	*-hp)
+		os=-hpux
+		;;
+	*-hitachi)
+		os=-hiux
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=-sysv
+		;;
+	*-cbm)
+		os=-amigaos
+		;;
+	*-dg)
+		os=-dgux
+		;;
+	*-dolphin)
+		os=-sysv3
+		;;
+	m68k-ccur)
+		os=-rtu
+		;;
+	m88k-omron*)
+		os=-luna
+		;;
+	*-next )
+		os=-nextstep
+		;;
+	*-sequent)
+		os=-ptx
+		;;
+	*-crds)
+		os=-unos
+		;;
+	*-ns)
+		os=-genix
+		;;
+	i370-*)
+		os=-mvs
+		;;
+	*-next)
+		os=-nextstep3
+		;;
+        *-gould)
+		os=-sysv
+		;;
+        *-highlevel)
+		os=-bsd
+		;;
+	*-encore)
+		os=-bsd
+		;;
+        *-sgi)
+		os=-irix
+		;;
+        *-siemens)
+		os=-sysv4
+		;;
+	*-masscomp)
+		os=-rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=-uxpv
+		;;
+	*-rom68k)
+		os=-coff
+		;;
+	*-*bug)
+		os=-coff
+		;;
+	*-apple)
+		os=-macos
+		;;
+	*-atari*)
+		os=-mint
+		;;
+	*)
+		os=-none
+		;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+	*-unknown)
+		case $os in
+			-riscix*)
+				vendor=acorn
+				;;
+			-sunos*)
+				vendor=sun
+				;;
+			-aix*)
+				vendor=ibm
+				;;
+			-beos*)
+				vendor=be
+				;;
+			-hpux*)
+				vendor=hp
+				;;
+			-mpeix*)
+				vendor=hp
+				;;
+			-hiux*)
+				vendor=hitachi
+				;;
+			-unos*)
+				vendor=crds
+				;;
+			-dgux*)
+				vendor=dg
+				;;
+			-luna*)
+				vendor=omron
+				;;
+			-genix*)
+				vendor=ns
+				;;
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-ptx*)
+				vendor=sequent
+				;;
+			-vxsim* | -vxworks*)
+				vendor=wrs
+				;;
+			-aux*)
+				vendor=apple
+				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+		esac
+		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+		;;
+esac
+
+echo $basic_machine$os
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/libfec/configure.in b/libfec/configure.in
new file mode 100644
index 0000000..10b5380
--- /dev/null
+++ b/libfec/configure.in
@@ -0,0 +1,90 @@
+dnl Process this file with autoconf to produce a configure script.
+AC_INIT(viterbi27.c)
+AC_CONFIG_HEADER(config.h)
+SO_NAME=3
+VERSION=3.0.0
+AC_SUBST(SO_NAME)
+AC_SUBST(VERSION)
+
+dnl Checks for programs. GCC is quoted below so the test stays valid when the variable is empty.
+AC_PROG_CC
+if test "x$GCC" != "xyes"
+then
+	AC_MSG_ERROR([Need GNU C compiler])
+fi
+dnl Checks for libraries.
+AC_CHECK_LIB(c, malloc)
+
+dnl Checks for header files. Each result is read back from the ac_cv_header_NAME_h cache variable.
+AC_CHECK_HEADERS(getopt.h stdio.h stdlib.h memory.h string.h)
+if test "x$ac_cv_header_stdio_h" != "xyes"
+then
+	AC_MSG_ERROR([Need stdio.h!])
+fi
+if test "x$ac_cv_header_stdlib_h" != "xyes"
+then
+	AC_MSG_ERROR([Need stdlib.h!])
+fi
+if test "x$ac_cv_header_memory_h" != "xyes"
+then
+	AC_MSG_ERROR([Need memory.h!])
+fi
+if test "x$ac_cv_header_string_h" != "xyes"
+then
+	AC_MSG_ERROR([Need string.h])
+fi
+
+AC_CANONICAL_SYSTEM
+case $target_cpu in
+x86_64)
+	ARCH_OPTION="-msse2"
+	MLIBS="dotprod_port.o \
+	peakval_port.o \
+	sumsq.o sumsq_port.o \
+	cpu_mode_x86_64.o"
+	;;
+i386|i486|i586|i686)
+    ARCH_OPTION="-march=$target_cpu"
+	MLIBS="viterbi27_mmx.o mmxbfly27.o viterbi27_sse.o ssebfly27.o viterbi27_sse2.o sse2bfly27.o \
+	viterbi29_mmx.o mmxbfly29.o viterbi29_sse.o ssebfly29.o viterbi29_sse2.o sse2bfly29.o \
+	viterbi39_sse2.o viterbi39_sse.o viterbi39_mmx.o \
+	viterbi615_mmx.o viterbi615_sse.o viterbi615_sse2.o \
+	dotprod_mmx.o dotprod_mmx_assist.o \
+	dotprod_sse2.o dotprod_sse2_assist.o \
+	peakval_mmx.o peakval_mmx_assist.o \
+	peakval_sse.o peakval_sse_assist.o \
+	peakval_sse2.o peakval_sse2_assist.o \
+	sumsq.o sumsq_port.o \
+	sumsq_sse2.o sumsq_sse2_assist.o \
+	sumsq_mmx.o sumsq_mmx_assist.o \
+	cpu_features.o cpu_mode_x86.o"
+	;;
+powerpc*)
+	ARCH_OPTION="-fno-common -faltivec"
+	MLIBS="viterbi27_av.o viterbi29_av.o viterbi39_av.o viterbi615_av.o \
+	encode_rs_av.o \
+	dotprod_av.o sumsq_av.o peakval_av.o cpu_mode_ppc.o"
+	;;
+*)
+	MLIBS="cpu_mode_generic.o"
+esac
+case $target_os in
+darwin*)
+	SH_LIB=libfec.dylib
+	REBIND=""
+	;;
+*)
+	SH_LIB=libfec.so
+	REBIND=ldconfig
+	;;
+esac
+AC_SUBST(SH_LIB)
+AC_SUBST(REBIND)
+AC_SUBST(MLIBS)
+AC_SUBST(ARCH_OPTION)
+
+
+dnl Checks for library functions.
+AC_CHECK_FUNCS(getopt_long memset memmove)
+
+AC_OUTPUT(makefile)
diff --git a/libfec/cpu_features.s b/libfec/cpu_features.s
new file mode 100644
index 0000000..ef4ba4e
--- /dev/null
+++ b/libfec/cpu_features.s
@@ -0,0 +1,15 @@
+.text
+.global cpu_features
+	.type cpu_features,@function
+cpu_features:	# returns in %eax the %edx word produced by cpuid with %eax=1
+	pushl %ebx		# save the registers that are restored after cpuid
+	pushl %ecx
+	pushl %edx
+	movl $1,%eax		# cpuid input value: 1
+	cpuid
+	movl %edx,%eax		# the caller receives the %edx result
+	popl %edx		# restore in reverse order of the pushes
+	popl %ecx
+	popl %ebx
+	ret
+	
\ No newline at end of file
diff --git a/libfec/cpu_mode_generic.c b/libfec/cpu_mode_generic.c
new file mode 100644
index 0000000..500f995
--- /dev/null
+++ b/libfec/cpu_mode_generic.c
@@ -0,0 +1,13 @@
+/* Fallback CPU mode selection for CPUs without a dedicated cpu_mode_*.c file
+ * Copyright 2004 Phil Karn, KA9Q
+ * Copyright 2014 Matthias P. Braendli, HB9EGM
+ */
+#include <stdio.h>
+#include "fec.h"
+
+enum cpu_mode Cpu_mode;
+
+// No SIMD detection is attempted here: always select the portable C code
+void find_cpu_mode(void) {
+  Cpu_mode = PORT;
+}
diff --git a/libfec/cpu_mode_ppc.c b/libfec/cpu_mode_ppc.c
new file mode 100644
index 0000000..0071558
--- /dev/null
+++ b/libfec/cpu_mode_ppc.c
@@ -0,0 +1,40 @@
+/* Determine CPU support for SIMD on Power PC
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+#endif
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+  /* The mode is determined once; later calls return immediately */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* Portable C unless the OS reports a vector unit below */
+  Cpu_mode = PORT;
+
+#ifdef __VEC__
+  {
+    /* Ask the OS (CTL_HW / HW_VECTORUNIT) if we have Altivec support */
+    int mib[2] = { CTL_HW, HW_VECTORUNIT };
+    int vector_unit = 0;
+    size_t len = sizeof(vector_unit);
+
+    /* A failed query or a zero answer keeps the portable mode */
+    if(sysctl(mib, 2, &vector_unit, &len, NULL, 0) == 0 && vector_unit)
+      Cpu_mode = ALTIVEC;
+  }
+#endif
+
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/libfec/cpu_mode_x86.c b/libfec/cpu_mode_x86.c
new file mode 100644
index 0000000..322018e
--- /dev/null
+++ b/libfec/cpu_mode_x86.c
@@ -0,0 +1,33 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include "fec.h"
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+  int features;
+  /* Detection runs once; later calls keep the mode already chosen */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* Pick the most capable SIMD set reported by cpu_features() */
+  features = cpu_features();
+  if(features & (1<<26))
+    Cpu_mode = SSE2; /* SSE2 is present */
+  else if(features & (1<<25))
+    Cpu_mode = SSE; /* SSE is present */
+  else if(features & (1<<23))
+    Cpu_mode = MMX; /* MMX is present */
+  else
+    Cpu_mode = PORT; /* No SIMD at all */
+
+  fprintf(stderr,"SIMD CPU detect: %s\n",Cpu_modes[Cpu_mode]);
+}
diff --git a/libfec/cpu_mode_x86_64.c b/libfec/cpu_mode_x86_64.c
new file mode 100644
index 0000000..758096a
--- /dev/null
+++ b/libfec/cpu_mode_x86_64.c
@@ -0,0 +1,27 @@
+/* Determine CPU support for SIMD
+ * Copyright 2004 Phil Karn, KA9Q
+ *
+ * Modified in 2012 by Matthias P. Braendli, HB9EGM
+ */
+#include <stdio.h>
+#include "fec.h"
+
+/* Various SIMD instruction set names */
+char *Cpu_modes[] = {"Unknown","Portable C","x86 Multi Media Extensions (MMX)",
+		   "x86 Streaming SIMD Extensions (SSE)",
+		   "x86 Streaming SIMD Extensions 2 (SSE2)",
+		   "PowerPC G4/G5 Altivec/Velocity Engine"};
+
+enum cpu_mode Cpu_mode;
+
+void find_cpu_mode(void){
+  /* Nothing to do once a mode has been chosen by an earlier call */
+  if(Cpu_mode != UNKNOWN)
+    return;
+
+  /* According to the wikipedia entry x86-64, all x86-64 processors have SSE2 */
+  /* The same assumption is also in other source files ! */
+  Cpu_mode = SSE2;
+  /* NOTE(review): the message says "portable C" although the mode is SSE2 - confirm intent */
+  fprintf(stderr,"CPU: x86-64, using portable C implementation\n");
+}
diff --git a/libfec/decode_rs.c b/libfec/decode_rs.c
new file mode 100644
index 0000000..d7f97b3
--- /dev/null
+++ b/libfec/decode_rs.c
@@ -0,0 +1,262 @@
+/* Reed-Solomon decoder
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#define NULL ((void *)0)
+#define	min(a,b)	((a) < (b) ? (a) : (b))
+
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#else
+#include "char.h"
+#endif
+
+int DECODE_RS(
+#ifdef FIXED
+data_t *data, int *eras_pos, int no_eras,int pad){
+#else
+void *p,data_t *data, int *eras_pos, int no_eras){
+  struct rs *rs = (struct rs *)p;
+#endif
+  int deg_lambda, el, deg_omega;
+  int i, j, r,k;
+  data_t u,q,tmp,num1,num2,den,discr_r;
+  data_t lambda[NROOTS+1], s[NROOTS];	/* Err+Eras Locator poly
+					 * and syndrome poly */
+  data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+  data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+  int syn_error, count;
+  /* More than NROOTS erasures would overrun lambda[]; a NULL erasure list cannot be read */
+  if(no_eras > NROOTS || (no_eras > 0 && eras_pos == NULL)) return -1;
+#ifdef FIXED
+  /* Check pad parameter for validity */
+  if(pad < 0 || pad >= NN)
+    return -1;
+#endif
+  /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+  for(i=0;i<NROOTS;i++)
+    s[i] = data[0];
+
+  for(j=1;j<NN-PAD;j++){
+    for(i=0;i<NROOTS;i++){
+      if(s[i] == 0){
+	s[i] = data[j];
+      } else {
+	s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+      }
+    }
+  }
+
+  /* Convert syndromes to index form, checking for nonzero condition */
+  syn_error = 0;
+  for(i=0;i<NROOTS;i++){
+    syn_error |= s[i];
+    s[i] = INDEX_OF[s[i]];
+  }
+
+  if (!syn_error) {
+    /* if syndrome is zero, data[] is a codeword and there are no
+     * errors to correct. So return data[] unmodified
+     */
+    count = 0;
+    goto finish;
+  }
+  memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+  lambda[0] = 1;
+
+  if (no_eras > 0) {
+    /* Init lambda to be the erasure locator polynomial */
+    lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+    for (i = 1; i < no_eras; i++) {
+      u = MODNN(PRIM*(NN-1-eras_pos[i]));
+      for (j = i+1; j > 0; j--) {
+	tmp = INDEX_OF[lambda[j - 1]];
+	if(tmp != A0)
+	  lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+      }
+    }
+
+#if DEBUG >= 1
+    /* Test code that verifies the erasure locator polynomial just constructed
+       Needed only for decoder debugging. */
+    
+    /* find roots of the erasure location polynomial */
+    for(i=1;i<=no_eras;i++)
+      reg[i] = INDEX_OF[lambda[i]];
+
+    count = 0;
+    for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+      q = 1;
+      for (j = 1; j <= no_eras; j++)
+	if (reg[j] != A0) {
+	  reg[j] = MODNN(reg[j] + j);
+	  q ^= ALPHA_TO[reg[j]];
+	}
+      if (q != 0)
+	continue;
+      /* store root and error location number indices */
+      root[count] = i;
+      loc[count] = k;
+      count++;
+    }
+    if (count != no_eras) {
+      printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+      count = -1;
+      goto finish;
+    }
+#if DEBUG >= 2
+    printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+    for (i = 0; i < count; i++)
+      printf("%d ", loc[i]);
+    printf("\n");
+#endif
+#endif
+  }
+  for(i=0;i<NROOTS+1;i++)
+    b[i] = INDEX_OF[lambda[i]];
+  
+  /*
+   * Begin Berlekamp-Massey algorithm to determine error+erasure
+   * locator polynomial
+   */
+  r = no_eras;
+  el = no_eras;
+  while (++r <= NROOTS) {	/* r is the step number */
+    /* Compute discrepancy at the r-th step in poly-form */
+    discr_r = 0;
+    for (i = 0; i < r; i++){
+      if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+	discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+      }
+    }
+    discr_r = INDEX_OF[discr_r];	/* Index form */
+    if (discr_r == A0) {
+      /* 2 lines below: B(x) <-- x*B(x) */
+      memmove(&b[1],b,NROOTS*sizeof(b[0]));
+      b[0] = A0;
+    } else {
+      /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+      t[0] = lambda[0];
+      for (i = 0 ; i < NROOTS; i++) {
+	if(b[i] != A0)
+	  t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+	else
+	  t[i+1] = lambda[i+1];
+      }
+      if (2 * el <= r + no_eras - 1) {
+	el = r + no_eras - el;
+	/*
+	 * 2 lines below: B(x) <-- inv(discr_r) *
+	 * lambda(x)
+	 */
+	for (i = 0; i <= NROOTS; i++)
+	  b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+      } else {
+	/* 2 lines below: B(x) <-- x*B(x) */
+	memmove(&b[1],b,NROOTS*sizeof(b[0]));
+	b[0] = A0;
+      }
+      memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+    }
+  }
+
+  /* Convert lambda to index form and compute deg(lambda(x)) */
+  deg_lambda = 0;
+  for(i=0;i<NROOTS+1;i++){
+    lambda[i] = INDEX_OF[lambda[i]];
+    if(lambda[i] != A0)
+      deg_lambda = i;
+  }
+  /* Find roots of the error+erasure locator polynomial by Chien search */
+  memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+  count = 0;		/* Number of roots of lambda(x) */
+  for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+    q = 1; /* lambda[0] is always 0 */
+    for (j = deg_lambda; j > 0; j--){
+      if (reg[j] != A0) {
+	reg[j] = MODNN(reg[j] + j);
+	q ^= ALPHA_TO[reg[j]];
+      }
+    }
+    if (q != 0)
+      continue; /* Not a root */
+    /* store root (index-form) and error location number */
+#if DEBUG>=2
+    printf("count %d root %d loc %d\n",count,i,k);
+#endif
+    root[count] = i;
+    loc[count] = k;
+    /* If we've already found max possible roots,
+     * abort the search to save time
+     */
+    if(++count == deg_lambda)
+      break;
+  }
+  if (deg_lambda != count) {
+    /*
+     * deg(lambda) unequal to number of roots => uncorrectable
+     * error detected
+     */
+    count = -1;
+    goto finish;
+  }
+  /*
+   * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+   * x**NROOTS). in index form. Also find deg(omega).
+   */
+  deg_omega = deg_lambda-1;
+  for (i = 0; i <= deg_omega;i++){
+    tmp = 0;
+    for(j=i;j >= 0; j--){
+      if ((s[i - j] != A0) && (lambda[j] != A0))
+	tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+    }
+    omega[i] = INDEX_OF[tmp];
+  }
+
+  /*
+   * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+   * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+   */
+  for (j = count-1; j >=0; j--) {
+    num1 = 0;
+    for (i = deg_omega; i >= 0; i--) {
+      if (omega[i] != A0)
+	num1  ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+    }
+    num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+    den = 0;
+    
+    /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+    for (i = min(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+      if(lambda[i+1] != A0)
+	den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+    }
+#if DEBUG >= 1
+    if (den == 0) {
+      printf("\n ERROR: denominator = 0\n");
+      count = -1;
+      goto finish;
+    }
+#endif
+    /* Apply error to data */
+    if (num1 != 0 && loc[j] >= PAD) {
+      data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+    }
+  }
+ finish:
+  if(eras_pos != NULL){
+    for(i=0;i<count;i++)
+      eras_pos[i] = loc[i];
+  }
+  return count;
+}
diff --git a/libfec/decode_rs.h b/libfec/decode_rs.h
new file mode 100644
index 0000000..c165cf3
--- /dev/null
+++ b/libfec/decode_rs.h
@@ -0,0 +1,298 @@
+/* The guts of the Reed-Solomon decoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN data and parity symbols to be corrected in place
+ * eras_pos - array of erasure positions, read only when no_eras > 0; unless it is
+ *            NULL, it is overwritten on exit with the error locations that were found
+ * no_eras - the number of erasure positions supplied in eras_pos[]
+ * retval - an integer lvalue into which the decoder's return code is written
+ * NROOTS - the number of roots in the RS code generator polynomial, which is the
+ *          same as the number of parity symbols in a block. Integer variable or literal.
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ *            elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN elements to convert Galois field
+ *            elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
+ * FCR - The first consecutive root of the RS generator polynomial. Integer variable or literal.
+ * PRIM - The primitive root of the generator poly. Integer variable or literal.
+ * IPRIM - The step by which the root search advances the error location number.
+ *         Integer variable or literal. Not covered by the #error checks below.
+ * DEBUG - If set to 1 or more, do various internal consistency checking. Leave this
+ *         undefined for production code
+ * memset(), memmove() and memcpy() are used; the including file must declare them (<string.h>).
+ */
+
+
+#if !defined(NROOTS)
+#error "NROOTS not defined"
+#endif
+
+#if !defined(NN)
+#error "NN not defined"
+#endif
+
+#if !defined(PAD)
+#error "PAD not defined"
+#endif
+
+#if !defined(ALPHA_TO)
+#error "ALPHA_TO not defined"
+#endif
+
+#if !defined(INDEX_OF)
+#error "INDEX_OF not defined"
+#endif
+
+#if !defined(MODNN)
+#error "MODNN not defined"
+#endif
+
+#if !defined(FCR)
+#error "FCR not defined"
+#endif
+
+#if !defined(PRIM)
+#error "PRIM not defined"
+#endif
+
+#if !defined(NULL)
+#define NULL ((void *)0)
+#endif
+
+#undef MIN
+#define	MIN(a,b)	((a) < (b) ? (a) : (b))
+#undef A0
+#define A0 (NN)
+
+{
+  int deg_lambda, el, deg_omega;
+  int i, j, r,k;
+  data_t u,q,tmp,num1,num2,den,discr_r;
+  data_t lambda[NROOTS+1], s[NROOTS];	/* Err+Eras Locator poly
+					 * and syndrome poly */
+  data_t b[NROOTS+1], t[NROOTS+1], omega[NROOTS+1];
+  data_t root[NROOTS], reg[NROOTS+1], loc[NROOTS];
+  int syn_error, count;
+
+  /* form the syndromes; i.e., evaluate data(x) at roots of g(x) */
+  for(i=0;i<NROOTS;i++)
+    s[i] = data[0];
+
+  for(j=1;j<NN-PAD;j++){
+    for(i=0;i<NROOTS;i++){
+      if(s[i] == 0){
+	s[i] = data[j];
+      } else {
+	s[i] = data[j] ^ ALPHA_TO[MODNN(INDEX_OF[s[i]] + (FCR+i)*PRIM)];
+      }
+    }
+  }
+
+  /* Convert syndromes to index form, checking for nonzero condition */
+  syn_error = 0;
+  for(i=0;i<NROOTS;i++){
+    syn_error |= s[i];
+    s[i] = INDEX_OF[s[i]];
+  }
+
+  if (!syn_error) {
+    /* if syndrome is zero, data[] is a codeword and there are no
+     * errors to correct. So return data[] unmodified
+     */
+    count = 0;
+    goto finish;
+  }
+  memset(&lambda[1],0,NROOTS*sizeof(lambda[0]));
+  lambda[0] = 1;
+
+  if (no_eras > 0) {
+    /* Init lambda to be the erasure locator polynomial */
+    lambda[1] = ALPHA_TO[MODNN(PRIM*(NN-1-eras_pos[0]))];
+    for (i = 1; i < no_eras; i++) {
+      u = MODNN(PRIM*(NN-1-eras_pos[i]));
+      for (j = i+1; j > 0; j--) {
+	tmp = INDEX_OF[lambda[j - 1]];
+	if(tmp != A0)
+	  lambda[j] ^= ALPHA_TO[MODNN(u + tmp)];
+      }
+    }
+
+#if DEBUG >= 1
+    /* Test code that verifies the erasure locator polynomial just constructed
+       Needed only for decoder debugging. */
+    
+    /* find roots of the erasure location polynomial */
+    for(i=1;i<=no_eras;i++)
+      reg[i] = INDEX_OF[lambda[i]];
+
+    count = 0;
+    for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+      q = 1;
+      for (j = 1; j <= no_eras; j++)
+	if (reg[j] != A0) {
+	  reg[j] = MODNN(reg[j] + j);
+	  q ^= ALPHA_TO[reg[j]];
+	}
+      if (q != 0)
+	continue;
+      /* store root and error location number indices */
+      root[count] = i;
+      loc[count] = k;
+      count++;
+    }
+    if (count != no_eras) {
+      printf("count = %d no_eras = %d\n lambda(x) is WRONG\n",count,no_eras);
+      count = -1;
+      goto finish;
+    }
+#if DEBUG >= 2
+    printf("\n Erasure positions as determined by roots of Eras Loc Poly:\n");
+    for (i = 0; i < count; i++)
+      printf("%d ", loc[i]);
+    printf("\n");
+#endif
+#endif
+  }
+  for(i=0;i<NROOTS+1;i++)
+    b[i] = INDEX_OF[lambda[i]];
+  
+  /*
+   * Begin Berlekamp-Massey algorithm to determine error+erasure
+   * locator polynomial
+   */
+  r = no_eras;
+  el = no_eras;
+  while (++r <= NROOTS) {	/* r is the step number */
+    /* Compute discrepancy at the r-th step in poly-form */
+    discr_r = 0;
+    for (i = 0; i < r; i++){
+      if ((lambda[i] != 0) && (s[r-i-1] != A0)) {
+	discr_r ^= ALPHA_TO[MODNN(INDEX_OF[lambda[i]] + s[r-i-1])];
+      }
+    }
+    discr_r = INDEX_OF[discr_r];	/* Index form */
+    if (discr_r == A0) {
+      /* 2 lines below: B(x) <-- x*B(x) */
+      memmove(&b[1],b,NROOTS*sizeof(b[0]));
+      b[0] = A0;
+    } else {
+      /* 7 lines below: T(x) <-- lambda(x) - discr_r*x*b(x) */
+      t[0] = lambda[0];
+      for (i = 0 ; i < NROOTS; i++) {
+	if(b[i] != A0)
+	  t[i+1] = lambda[i+1] ^ ALPHA_TO[MODNN(discr_r + b[i])];
+	else
+	  t[i+1] = lambda[i+1];
+      }
+      if (2 * el <= r + no_eras - 1) {
+	el = r + no_eras - el;
+	/*
+	 * 2 lines below: B(x) <-- inv(discr_r) *
+	 * lambda(x)
+	 */
+	for (i = 0; i <= NROOTS; i++)
+	  b[i] = (lambda[i] == 0) ? A0 : MODNN(INDEX_OF[lambda[i]] - discr_r + NN);
+      } else {
+	/* 2 lines below: B(x) <-- x*B(x) */
+	memmove(&b[1],b,NROOTS*sizeof(b[0]));
+	b[0] = A0;
+      }
+      memcpy(lambda,t,(NROOTS+1)*sizeof(t[0]));
+    }
+  }
+
+  /* Convert lambda to index form and compute deg(lambda(x)) */
+  deg_lambda = 0;
+  for(i=0;i<NROOTS+1;i++){
+    lambda[i] = INDEX_OF[lambda[i]];
+    if(lambda[i] != A0)
+      deg_lambda = i;
+  }
+  /* Find roots of the error+erasure locator polynomial by Chien search */
+  memcpy(&reg[1],&lambda[1],NROOTS*sizeof(reg[0]));
+  count = 0;		/* Number of roots of lambda(x) */
+  for (i = 1,k=IPRIM-1; i <= NN; i++,k = MODNN(k+IPRIM)) {
+    q = 1; /* lambda[0] is always 0 */
+    for (j = deg_lambda; j > 0; j--){
+      if (reg[j] != A0) {
+	reg[j] = MODNN(reg[j] + j);
+	q ^= ALPHA_TO[reg[j]];
+      }
+    }
+    if (q != 0)
+      continue; /* Not a root */
+    /* store root (index-form) and error location number */
+#if DEBUG>=2
+    printf("count %d root %d loc %d\n",count,i,k);
+#endif
+    root[count] = i;
+    loc[count] = k;
+    /* If we've already found max possible roots,
+     * abort the search to save time
+     */
+    if(++count == deg_lambda)
+      break;
+  }
+  if (deg_lambda != count) {
+    /*
+     * deg(lambda) unequal to number of roots => uncorrectable
+     * error detected
+     */
+    count = -1;
+    goto finish;
+  }
+  /*
+   * Compute err+eras evaluator poly omega(x) = s(x)*lambda(x) (modulo
+   * x**NROOTS). in index form. Also find deg(omega).
+   */
+  deg_omega = deg_lambda-1;
+  for (i = 0; i <= deg_omega;i++){
+    tmp = 0;
+    for(j=i;j >= 0; j--){
+      if ((s[i - j] != A0) && (lambda[j] != A0))
+	tmp ^= ALPHA_TO[MODNN(s[i - j] + lambda[j])];
+    }
+    omega[i] = INDEX_OF[tmp];
+  }
+
+  /*
+   * Compute error values in poly-form. num1 = omega(inv(X(l))), num2 =
+   * inv(X(l))**(FCR-1) and den = lambda_pr(inv(X(l))) all in poly-form
+   */
+  for (j = count-1; j >=0; j--) {
+    num1 = 0;
+    for (i = deg_omega; i >= 0; i--) {
+      if (omega[i] != A0)
+	num1  ^= ALPHA_TO[MODNN(omega[i] + i * root[j])];
+    }
+    num2 = ALPHA_TO[MODNN(root[j] * (FCR - 1) + NN)];
+    den = 0;
+    
+    /* lambda[i+1] for i even is the formal derivative lambda_pr of lambda[i] */
+    for (i = MIN(deg_lambda,NROOTS-1) & ~1; i >= 0; i -=2) {
+      if(lambda[i+1] != A0)
+	den ^= ALPHA_TO[MODNN(lambda[i+1] + i * root[j])];
+    }
+#if DEBUG >= 1
+    if (den == 0) {
+      printf("\n ERROR: denominator = 0\n");
+      count = -1;
+      goto finish;
+    }
+#endif
+    /* Apply error to data */
+    if (num1 != 0 && loc[j] >= PAD) {
+      data[loc[j]-PAD] ^= ALPHA_TO[MODNN(INDEX_OF[num1] + INDEX_OF[num2] + NN - INDEX_OF[den])];
+    }
+  }
+ finish:
+  if(eras_pos != NULL){
+    for(i=0;i<count;i++)
+      eras_pos[i] = loc[i];
+  }
+  retval = count;
+}
diff --git a/libfec/decode_rs_8.c b/libfec/decode_rs_8.c
new file mode 100644
index 0000000..995b0d9
--- /dev/null
+++ b/libfec/decode_rs_8.c
@@ -0,0 +1,24 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "fixed.h"
+
+int decode_rs_8(data_t *data, int *eras_pos, int no_eras, int pad){ /* decode one block in place */
+  int retval; /* return code; assigned by the decoder body included below */
+  /* Refuse pad counts outside 0..222: return -1 before any decoding is attempted */
+  if(pad < 0 || pad > 222){
+    return -1;
+  }
+  /* decode_rs.h is the decoder body itself; it works on data, eras_pos and no_eras */
+#include "decode_rs.h"
+  
+  return retval;
+}
diff --git a/libfec/decode_rs_ccsds.c b/libfec/decode_rs_ccsds.c
new file mode 100644
index 0000000..0e246b4
--- /dev/null
+++ b/libfec/decode_rs_ccsds.c
@@ -0,0 +1,26 @@
+/* This function wraps around the fixed 8-bit decoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+
+/* Decode a dual-basis (CCSDS) block in place; returns decode_rs_8()'s code, or -1 for a bad pad */
+int decode_rs_ccsds(data_t *data,int *eras_pos,int no_eras,int pad){
+  int i,r;
+  data_t cdata[NN];
+  if(pad < 0 || pad >= NN)	/* a negative pad would run the loops below past cdata[NN] */
+    return -1;
+  /* Convert data from dual basis to conventional */
+  for(i=0;i<NN-pad;i++)
+    cdata[i] = Tal1tab[data[i]];
+  r = decode_rs_8(cdata,eras_pos,no_eras,pad);
+  if(r > 0){
+    /* Convert from conventional to dual basis */
+    for(i=0;i<NN-pad;i++)
+      data[i] = Taltab[cdata[i]];
+  }
+  return r;
+}
diff --git a/libfec/decode_rs_char.c b/libfec/decode_rs_char.c
new file mode 100644
index 0000000..7105233
--- /dev/null
+++ b/libfec/decode_rs_char.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder for 8-bit symbols or less
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+int decode_rs_char(void *p, data_t *data, int *eras_pos, int no_eras){ /* decode one block in place */
+  int retval; /* return code; assigned by the decoder body included below */
+  struct rs *rs = (struct rs *)p; /* presumably read by the code-parameter macros from char.h - confirm */
+  /* decode_rs.h is the decoder body itself; it works on data, eras_pos and no_eras */
+#include "decode_rs.h"
+  
+  return retval;
+}
diff --git a/libfec/decode_rs_int.c b/libfec/decode_rs_int.c
new file mode 100644
index 0000000..1ef1a1f
--- /dev/null
+++ b/libfec/decode_rs_int.c
@@ -0,0 +1,22 @@
+/* General purpose Reed-Solomon decoder
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#include <string.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+int decode_rs_int(void *p, data_t *data, int *eras_pos, int no_eras){ /* decode one block in place */
+  int retval; /* return code; assigned by the decoder body included below */
+  struct rs *rs = (struct rs *)p; /* presumably read by the code-parameter macros from int.h - confirm */
+  /* decode_rs.h is the decoder body itself; it works on data, eras_pos and no_eras */
+#include "decode_rs.h"
+  
+  return retval;
+}
diff --git a/libfec/dotprod.c b/libfec/dotprod.c
new file mode 100644
index 0000000..5fb1da9
--- /dev/null
+++ b/libfec/dotprod.c
@@ -0,0 +1,111 @@
+/* 16-bit signed integer dot product
+ * Switch to appropriate versions
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+void *initdp_port(signed short coeffs[],int len);
+long dotprod_port(void *p,signed short *b);
+void freedp_port(void *p);
+
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void *initdp_sse2(signed short coeffs[],int len);
+long dotprod_mmx(void *p,signed short *b);
+long dotprod_sse2(void *p,signed short *b);
+void freedp_mmx(void *p);
+void freedp_sse2(void *p);
+#endif
+
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+long dotprod_av(void *p,signed short *b);
+void freedp_av(void *p);
+#endif
+
+/* Create and return a descriptor for use with dotprod(); it is built by the initdp_* variant chosen from Cpu_mode */
+void *initdp(signed short coeffs[],int len){
+  find_cpu_mode(); /* presumably fills in Cpu_mode before the dispatch - confirm */
+
+  switch(Cpu_mode){
+  case PORT:
+  default: /* any mode without a case of its own gets the portable C version */
+    return initdp_port(coeffs,len);
+#ifdef __i386__
+  case MMX:
+  case SSE: /* SSE shares the MMX implementation */
+    return initdp_mmx(coeffs,len);
+  case SSE2:
+    return initdp_sse2(coeffs,len);
+#endif
+
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 to the portable version */
+    return initdp_port(coeffs,len);
+#endif
+
+#ifdef __VEC__
+  case ALTIVEC:
+    return initdp_av(coeffs,len);
+#endif
+  }
+}
+
+
+/* Free a dot product descriptor created earlier, using the freedp_* variant
+ * selected by Cpu_mode. Nothing is returned. */
+void freedp(void *p){
+  switch(Cpu_mode){
+  case PORT:
+  default:	/* also reached for SSE2 on x86-64, which uses the portable version */
+    freedp_port(p);
+    break;
+#ifdef __i386__
+  case MMX:
+  case SSE:	/* SSE shares the MMX implementation */
+    freedp_mmx(p);
+    break;
+  case SSE2:
+    freedp_sse2(p);
+    break;
+#endif
+
+#ifdef __VEC__
+  case ALTIVEC:
+    freedp_av(p);
+    break;
+#endif
+  }
+}
+
+/* Compute a dot product given a descriptor and an input array; the length is taken
+ * from the descriptor. Dispatches on the current Cpu_mode without calling find_cpu_mode().
+ */
+long dotprod(void *p,signed short a[]){
+  switch(Cpu_mode){
+  case PORT:
+  default: /* any mode without a case of its own gets the portable C version */
+    return dotprod_port(p,a);
+#ifdef __i386__
+  case MMX:
+  case SSE: /* SSE shares the MMX implementation */
+    return dotprod_mmx(p,a);
+  case SSE2:
+    return dotprod_sse2(p,a);
+#endif
+
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 to the portable version */
+    return dotprod_port(p,a);
+#endif
+
+#ifdef __VEC__
+  case ALTIVEC:
+    return dotprod_av(p,a);
+#endif
+  }
+}
+
+
diff --git a/libfec/dotprod.h b/libfec/dotprod.h
new file mode 100644
index 0000000..6b62b70
--- /dev/null
+++ b/libfec/dotprod.h
@@ -0,0 +1,15 @@
+/* Internal definitions for dotproduct function */
+/* NOTE(review): each dotprod_*.c in this library declares its own struct dotprod - confirm this header is still included anywhere */
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+   * preshifted by 0,1,2,3 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   *
+   * SSE2 is similar, but with 8 words at a time (8 copies, preshifted by 0..7 words)
+   *
+   * On a non-MMX machine, only one copy is present
+   */
+  signed short *coeffs[8]; /* sized for the largest case; entries beyond the copies made are not filled in */
+};
diff --git a/libfec/dotprod_av.c b/libfec/dotprod_av.c
new file mode 100644
index 0000000..1f70471
--- /dev/null
+++ b/libfec/dotprod_av.c
@@ -0,0 +1,93 @@
+/* 16-bit signed integer dot product
+ * Altivec-assisted version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On an Altivec machine, these hold 8 copies of the coefficients,
+   * preshifted by 0,1,..7 words to meet all possible input data
+   */
+  signed short *coeffs[8];
+};
+
+/* Create a dot product descriptor; returns NULL if len is 0 or on allocation failure */
+void *initdp_av(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+
+  if(len == 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 8 copies of coefficients, one per data alignment (relies on calloc giving 16-byte aligned blocks) */
+  for(i=0;i<8;i++){
+    dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
+    if(dp->coeffs[i] == NULL){
+      while(i-- > 0) free(dp->coeffs[i]);	/* release the copies already made */
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];	/* copy i is shifted up by i words; calloc zeroed the padding */
+  }
+  return (void *)dp;
+}
+
+
+/* Free a dot product descriptor created earlier; a NULL descriptor is ignored */
+void freedp_av(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)	/* initdp_av() returns NULL for len == 0 */
+    return;
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]);	/* free(NULL) is a no-op */
+  free(dp);
+}
+
+/* Compute a dot product given a descriptor and an input array
+ * The length is taken from the descriptor
+ */
+long dotprod_av(void *p,signed short a[]){
+  struct dotprod *dp = (struct dotprod *)p;
+  int al;
+  vector signed short *ar,*d;
+  vector signed int sums0,sums1,sums2,sums3;
+  union { vector signed int v; signed int w[4];} s;
+  int nblocks;
+  /* Address arithmetic goes through unsigned long so that 64-bit pointers are not truncated */
+  /* round ar down to beginning of 16-byte block containing 0th element of
+   * input buffer. Then set d to one of 8 sets of shifted coefficients
+   */
+  ar = (vector signed short *)((unsigned long)a & ~15UL);
+  al = (int)(((unsigned long)a & 15)/sizeof(signed short));
+  d = (vector signed short *)dp->coeffs[al];
+  
+  nblocks = (dp->len+al-1)/8+1;	/* number of 8-word vectors covering al pad words plus len samples */
+  
+  /* Sum into four vectors each holding four 32-bit partial sums */
+  sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
+  while(nblocks >= 4){
+    sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
+    sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
+    sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
+    sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
+    nblocks -= 4;
+  }
+  sums0 = vec_adds(sums0,sums1);
+  sums2 = vec_adds(sums2,sums3);
+  sums0 = vec_adds(sums0,sums2);
+  while(nblocks-- > 0){	/* the 0..3 vectors left over from the unrolled loop */
+    sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
+  }
+  /* Sum 4 partial sums into final result */
+  s.v = vec_sums(sums0,(vector signed int)(0));
+  
+  return s.w[3];
+}
+
+
diff --git a/libfec/dotprod_mmx.c b/libfec/dotprod_mmx.c
new file mode 100644
index 0000000..c516afe
--- /dev/null
+++ b/libfec/dotprod_mmx.c
@@ -0,0 +1,81 @@
+/* 16-bit signed integer dot product
+ * MMX assisted version; also for SSE
+ *
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a MMX or SSE machine, these hold 4 copies of the coefficients,
+   * preshifted by 0,1,2,3 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   */
+  signed short *coeffs[4];
+};
+long dotprod_mmx_assist(signed short *a,signed short *b,int cnt);
+
+/* Create a dot product descriptor; returns NULL if len is 0 or on allocation failure */
+void *initdp_mmx(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j;
+
+  if(len == 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 4 copies of coefficients, one for each data alignment */
+  for(i=0;i<4;i++){
+    dp->coeffs[i] = (signed short *)calloc(1+(len+i-1)/4,4*sizeof(signed short));
+    if(dp->coeffs[i] == NULL){
+      while(i-- > 0) free(dp->coeffs[i]);	/* release the copies already made */
+      free(dp);
+      return NULL;
+    }
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];	/* copy i is shifted up by i words; calloc zeroed the padding */
+  }
+  return (void *)dp;
+}
+
+
+/* Free a dot product descriptor created earlier; a NULL descriptor is ignored */
+void freedp_mmx(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)	/* initdp_mmx() returns NULL for len == 0 */
+    return;
+  for(i=0;i<4;i++)
+    free(dp->coeffs[i]);	/* free(NULL) is a no-op */
+  free(dp);
+}
+
+/* Dot product of the input array a[] with the coefficients held in
+ * descriptor p; the number of terms comes from the descriptor.
+ */
+long dotprod_mmx(void *p,signed short a[]){
+  struct dotprod *desc = (struct dotprod *)p;
+  signed short *base;	/* a[] rounded down to an 8 byte boundary */
+  int shift;		/* number of words between base and a[] */
+  int nblocks;		/* number of 4-word blocks handed to the assembler */
+
+  /* Because the address is rounded down, memory in front of a[] may be
+   * read too, depending on how a[] is aligned. Those words are harmless:
+   * they meet the zero coefficients padded onto the chosen copy. The
+   * rounding stays inside one 8 byte unit, and x86 memory protection
+   * works on much larger boundaries, so no fault is expected from it.
+   */
+  base = (signed short *)((int)a & ~7);
+
+  /* shift picks one of the 4 pre-shifted coefficient sets and equals the
+   * number of zero words padded onto the front of that set
+   */
+  shift = a - base;
+  nblocks = (desc->len+shift-1)/4+1;
+
+  /* The assembler routine does the multiply-accumulate work */
+  return dotprod_mmx_assist(base,desc->coeffs[shift],nblocks);
+}
+
diff --git a/libfec/dotprod_mmx_assist.s b/libfec/dotprod_mmx_assist.s
new file mode 100644
index 0000000..25deffd
--- /dev/null
+++ b/libfec/dotprod_mmx_assist.s
@@ -0,0 +1,83 @@
+# SIMD MMX dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0; 
+#	cnt *= 4; 
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b should also be 64-bit aligned, or speed will suffer greatly
+# Copyright 1999, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+	
+	.text
+	.global dotprod_mmx_assist
+	.type dotprod_mmx_assist,@function
+dotprod_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt
+	pxor %mm0,%mm0		# clear running sum (in two 32-bit halves)
+	
+# MMX dot product loop unrolled 4 times, crunching 16 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx
+	jl   .Loop1Done
+	
+	movq (%esi),%mm1	# mm1 = a[3],a[2],a[1],a[0]
+ 	pmaddwd (%edi),%mm1	# mm1 = b[3]*a[3]+b[2]*a[2],b[1]*a[1]+b[0]*a[0]
+	paddd %mm1,%mm0
+	
+	movq 8(%esi),%mm1
+	pmaddwd 8(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 16(%esi),%mm1
+	pmaddwd 16(%edi),%mm1
+	paddd %mm1,%mm0
+
+	movq 24(%esi),%mm1
+	addl $32,%esi	
+	pmaddwd 24(%edi),%mm1
+	addl $32,%edi	
+	paddd %mm1,%mm0
+
+	jmp .Loop1
+.Loop1Done:
+	
+	addl $4,%ecx	
+	
+# MMX dot product loop, not unrolled, crunching 4 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2:	subl $1,%ecx
+	jl   .Loop2Done
+	
+	movq (%esi),%mm1
+	addl $8,%esi
+	pmaddwd (%edi),%mm1
+	addl $8,%edi
+	paddd %mm1,%mm0
+	jmp .Loop2
+.Loop2Done:
+	
+	movd %mm0,%ebx		# right-hand word to ebx
+	punpckhdq %mm0,%mm0	# left-hand word to right side of %mm0
+	movd %mm0,%eax
+	addl %ebx,%eax		# running sum now in %eax
+	emms			# done with MMX
+	
+	popl %ebx
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
diff --git a/libfec/dotprod_port.c b/libfec/dotprod_port.c
new file mode 100644
index 0000000..ef635ec
--- /dev/null
+++ b/libfec/dotprod_port.c
@@ -0,0 +1,58 @@
+/* 16-bit signed integer dot product
+ * Portable C version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+  /* Single heap copy of the len coefficients, filled in by initdp_port() */
+  signed short *coeffs;
+};
+
+/* Create a dot product descriptor; returns NULL if len is 0 or on allocation failure */
+void *initdp_port(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int j;
+
+  if(len == 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  dp->coeffs = (signed short *)calloc(len,sizeof(signed short));	/* just one copy for the C version */
+  if(dp->coeffs == NULL){
+    free(dp);	/* do not leak the half-built descriptor */
+    return NULL;
+  }
+  for(j=0;j<len;j++)
+    dp->coeffs[j] = coeffs[j];
+  return (void *)dp;
+}
+
+
+/* Free a dot product descriptor created earlier; a NULL descriptor is ignored */
+void freedp_port(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  if(dp == NULL)	/* initdp_port() returns NULL for len == 0 */
+    return;
+  free(dp->coeffs);	/* free(NULL) is a no-op */
+  free(dp);
+}
+
+/* Multiply-accumulate the input array a[] against the coefficients stored
+ * in descriptor p. The descriptor supplies the number of terms.
+ */
+long dotprod_port(void *p,signed short a[]){
+  const struct dotprod *desc = (const struct dotprod *)p;
+  const signed short *coef = desc->coeffs;
+  long sum = 0;
+  int remaining = desc->len;
+
+  /* each sample is widened to long before the multiplication */
+  while(remaining-- > 0)
+    sum += (long)*a++ * *coef++;
+  return sum;
+}
+
+
diff --git a/libfec/dotprod_sse2.c b/libfec/dotprod_sse2.c
new file mode 100644
index 0000000..1fddd18
--- /dev/null
+++ b/libfec/dotprod_sse2.c
@@ -0,0 +1,72 @@
+/* 16-bit signed integer dot product
+ * SSE2 version
+ * Copyright 2004 Phil Karn
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define _XOPEN_SOURCE 600
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+struct dotprod {
+  int len; /* Number of coefficients */
+
+  /* On a SSE2 machine, these hold 8 copies of the coefficients,
+   * preshifted by 0,1,..7 words to meet all possible input data
+   * alignments (see Intel ap559 on MMX dot products).
+   */
+  signed short *coeffs[8];
+};
+
+long dotprod_sse2_assist(signed short *a,signed short *b,int cnt);
+
+/* Create a dot product descriptor; returns NULL if len is 0 or on allocation failure */
+void *initdp_sse2(signed short coeffs[],int len){
+  struct dotprod *dp;
+  int i,j,blksize;
+
+  if(len == 0 || (dp = (struct dotprod *)calloc(1,sizeof(struct dotprod))) == NULL)
+    return NULL;
+  dp->len = len;
+  /* Make 8 copies of coefficients, one for each data alignment,
+   * each aligned to 16-byte boundary */
+  for(i=0;i<8;i++){
+    blksize = (1+(len+i-1)/8) * 8*sizeof(signed short);
+    if(posix_memalign((void **)&dp->coeffs[i],16,blksize) != 0){
+      while(i-- > 0) free(dp->coeffs[i]);	/* release the copies already made */
+      free(dp);
+      return NULL;
+    }
+    memset(dp->coeffs[i],0,blksize);	/* zero padding in front of and behind the shifted copy */
+    for(j=0;j<len;j++)
+      dp->coeffs[i][j+i] = coeffs[j];
+  }
+  return (void *)dp;
+}
+
+
+/* Free a dot product descriptor created earlier; a NULL descriptor is ignored */
+void freedp_sse2(void *p){
+  struct dotprod *dp = (struct dotprod *)p;
+  int i;
+  if(dp == NULL)	/* initdp_sse2() returns NULL for len == 0 */
+    return;
+  for(i=0;i<8;i++)
+    free(dp->coeffs[i]);	/* free(NULL) is a no-op */
+  free(dp);
+}
+
+/* Dot product of the input array a[] with the coefficients held in
+ * descriptor p; the number of terms comes from the descriptor.
+ */
+long dotprod_sse2(void *p,signed short a[]){
+  struct dotprod *desc = (struct dotprod *)p;
+  /* a[] rounded down to the 16 byte block that contains its first element */
+  signed short *base = (signed short *)((int)a & ~15);
+  /* distance from base to a[] in words; also selects the shifted coefficient copy */
+  int shift = a - base;
+  int nblocks = (desc->len+shift-1)/8+1;	/* 8-word blocks to process */
+
+  /* The assembler routine does the multiply-accumulate work */
+  return dotprod_sse2_assist(base,desc->coeffs[shift],nblocks);
+}
diff --git a/libfec/dotprod_sse2_assist.s b/libfec/dotprod_sse2_assist.s
new file mode 100644
index 0000000..47348fa
--- /dev/null
+++ b/libfec/dotprod_sse2_assist.s
@@ -0,0 +1,85 @@
+# SIMD SSE2 dot product
+# Equivalent to the following C code:
+# long dotprod(signed short *a,signed short *b,int cnt)
+# {
+#	long sum = 0; 
+#	cnt *= 8; 
+#	while(cnt--)
+#		sum += *a++ * *b++;
+#	return sum;
+# }
+# a and b must be 128-bit aligned
+# Copyright 2001, Phil Karn KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+	
+	.text
+	.global dotprod_sse2_assist
+	.type dotprod_sse2_assist,@function
+dotprod_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %ecx
+	pushl %ebx
+	movl 8(%ebp),%esi	# a
+	movl 12(%ebp),%edi	# b
+	movl 16(%ebp),%ecx	# cnt
+	pxor %xmm0,%xmm0		# clear running sum (in two 32-bit halves)
+	
+# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
+	.align 16
+.Loop1:	subl $4,%ecx
+	jl   .Loop1Done
+	
+	movdqa (%esi),%xmm1
+ 	pmaddwd (%edi),%xmm1
+	paddd %xmm1,%xmm0
+	
+	movdqa 16(%esi),%xmm1
+	pmaddwd 16(%edi),%xmm1
+	paddd %xmm1,%xmm0
+
+	movdqa 32(%esi),%xmm1
+	pmaddwd 32(%edi),%xmm1
+	paddd %xmm1,%xmm0
+
+	movdqa 48(%esi),%xmm1
+	addl $64,%esi	
+	pmaddwd 48(%edi),%xmm1
+	addl $64,%edi	
+	paddd %xmm1,%xmm0
+
+	jmp .Loop1
+.Loop1Done:
+	
+	addl $4,%ecx	
+	
+# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
+# This could be redone as Duff's Device on the unrolled loop above
+.Loop2:	subl $1,%ecx
+	jl   .Loop2Done
+	
+	movdqa (%esi),%xmm1
+	addl $16,%esi
+	pmaddwd (%edi),%xmm1
+	addl $16,%edi
+	paddd %xmm1,%xmm0
+	jmp .Loop2
+.Loop2Done:
+
+	movdqa %xmm0,%xmm1
+	psrldq $8,%xmm0
+	paddd %xmm1,%xmm0
+	movd %xmm0,%eax		# right-hand word to eax
+	psrldq $4,%xmm0
+	movd %xmm0,%ebx
+	addl %ebx,%eax
+
+	popl %ebx
+	popl %ecx
+	popl %edi
+	popl %esi
+	movl %ebp,%esp
+	popl %ebp
+	ret
diff --git a/libfec/dsp.3 b/libfec/dsp.3
new file mode 100644
index 0000000..e9794da
--- /dev/null
+++ b/libfec/dsp.3
@@ -0,0 +1,63 @@
+.TH DSP 3
+.SH NAME
+initdp, freedp, dotprod, sumsq, peakval -\ SIMD-assisted
+digital signal processing primitives
+.SH SYNOPSIS
+.nf
+.ft
+#include "fec.h"
+
+void *initdp(signed short *coeffs,int len);
+long dotprod(void *p,signed short *a);
+void freedp(void *p);
+
+unsigned long long sumsq(signed short *in,int cnt);
+
+int peakval(signed short *b,int cnt);
+
+.SH DESCRIPTION
+These functions provide several basic primitives useful in digital
+signal processing (DSP), especially in modems.  The \fBinitdp\fR,
+\fBdotprod\fR and \fBfreedp\fR functions implement an integer dot
+product useful in correlation and filtering operations on signed
+16-bit integers. \fBsumsq\fR computes the sum
+of the squares of an array of signed 16-bit integers,
+useful for measuring the energy of a signal. \fBpeakval\fR returns the
+absolute value of the largest magnitude element in the input array,
+useful for scaling a signal's amplitude.
+
+Each function uses IA32 or PowerPC Altivec instructions when
+available; otherwise, a portable C version is used.
+
+.SH USAGE
+To create a FIR filter or correlator, call \fBinitdp\fR with the
+coefficients in \fBcoeffs\fR and their number in \fBlen\fR.  This
+creates the appropriate data structures and returns a handle.
+
+To compute a dot product, pass the handle from \fBinitdp\fR and the
+input array to \fBdotprod\fR. No length field is needed as the number
+of samples will be taken from the \fBlen\fR parameter originally given
+to \fBinitdp\fR. There must be at least as many samples in the input
+array as there were coefficients passed to \fBinitdp\fR.
+
+When the filter or correlator is no longer needed, the data structures
+may be freed by passing the handle to \fBfreedp\fR.
+
+The user is responsible for scaling the inputs to \fBinitdp\fR and
+\fBdotprod\fR, as the 32-bit result from \fBdotprod\fR will silently
+wrap around in the event of overflow.
+
+To compute the sum of the squares of an array of signed 16-bit
+integers, use \fBsumsq\fR. This returns a 64 bit sum.
+
+\fBpeakval\fR computes the absolute value of each 16-bit element in
+the input array and returns the largest.
+
+.SH RETURN VALUES
+
+\fBinitdp\fR returns a handle that points to a control block, or NULL in
+the event of an error (such as a memory allocation failure). \fBsumsq\fR
+and \fBpeakval\fR have no error returns.
+
+.SH AUTHOR and COPYRIGHT
+Phil Karn, KA9Q (karn@ka9q.net)
diff --git a/libfec/dtest.c b/libfec/dtest.c
new file mode 100644
index 0000000..394cb03
--- /dev/null
+++ b/libfec/dtest.c
@@ -0,0 +1,99 @@
+/* Test dot-product function */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include "config.h"
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {	/* long options; fields are name, has_arg, flag, val */
+  {"force-altivec",0,NULL,'a'},
+  {"force-port",0,NULL,'p'},
+  {"force-mmx",0,NULL,'m'},
+  {"force-sse",0,NULL,'s'},
+  {"force-sse2",0,NULL,'t'},
+  {"trials",1,NULL,'n'},	/* takes an argument, like "n:" in the short option string */
+  {NULL},
+};
+#endif
+
+int main(int argc,char *argv[]){	/* compares dotprod() with dotprod_port() on random data */
+  short coeffs[512];
+  short input[2048];
+  int trials=1000,d;
+  int errors = 0;
+
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"apmstn:",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"apmstn:")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    }
+  }
+
+  while(trials--){
+    long port_result;
+    long simd_result;
+    int ntaps;
+    int i;
+    int csum = 0;
+    int offset;
+    void *dp_simd,*dp_port;
+
+    /* Generate set of coefficients
+     * limit sum of absolute values to 32767 to avoid overflow
+     */
+    memset(coeffs,0,sizeof(coeffs));
+    for(i=0;i<512;i++){
+      double gv;
+
+      gv = normal_rand(0.,100.);
+      if(csum + fabs(gv) > 32767)
+	break;
+      coeffs[i] = gv;
+      csum += fabs(gv);
+    }
+    ntaps = i;
+
+    /* Compare results to portable C version for a bunch of random data buffers and offsets */
+    dp_simd = initdp(coeffs,ntaps);
+    dp_port = initdp_port(coeffs,ntaps);
+    for(i=0;i<2048;i++)
+      input[i] = random();
+    offset = random() & 511;
+
+    simd_result = dotprod(dp_simd,input+offset);
+    port_result = dotprod_port(dp_port,input+offset);
+    if(simd_result != port_result){
+      errors++;
+    }
+    freedp(dp_simd);	/* each trial builds fresh descriptors, so release them here */
+    freedp_port(dp_port);
+  }
+  printf("dtest: %d errors\n",errors);
+  exit(0);	/* NOTE(review): the exit status is 0 even when errors were counted */
+}
diff --git a/libfec/encode_rs.c b/libfec/encode_rs.c
new file mode 100644
index 0000000..0649094
--- /dev/null
+++ b/libfec/encode_rs.c
@@ -0,0 +1,52 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#ifdef FIXED
+#include "fixed.h"
+#elif defined(BIGSYM)
+#include "int.h"
+#else
+#include "char.h"
+#endif
+
+void ENCODE_RS( /* Reed-Solomon encoder: fills bb[] with NROOTS parity symbols computed from data[] */
+#ifdef FIXED
+data_t *data, data_t *bb,int pad){
+#else
+void *p,data_t *data, data_t *bb){
+  struct rs *rs = (struct rs *)p;
+#endif
+  int i, j;
+  data_t feedback;
+  /* NOTE(review): A0 must come from the included header; fixed.h in this tree defines none - confirm the FIXED build */
+#ifdef FIXED
+  /* Check pad parameter for validity */
+  if(pad < 0 || pad >= NN)
+    return;
+#endif
+
+  memset(bb,0,NROOTS*sizeof(data_t)); /* parity register starts out all zero */
+
+  for(i=0;i<NN-NROOTS-PAD;i++){ /* one pass per data symbol */
+    feedback = INDEX_OF[data[i] ^ bb[0]]; /* index (log) form of the feedback symbol */
+    if(feedback != A0){      /* feedback term is non-zero */
+#ifdef UNNORMALIZED
+      /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must
+       * always be for the polynomials constructed by init_rs()
+       */
+      feedback = MODNN(NN - GENPOLY[NROOTS] + feedback);
+#endif
+      for(j=1;j<NROOTS;j++)
+	bb[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])];
+    }
+    /* Shift */
+    memmove(&bb[0],&bb[1],sizeof(data_t)*(NROOTS-1));
+    if(feedback != A0) /* the slot vacated by the shift gets the GENPOLY[0] term, or zero */
+      bb[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])];
+    else
+      bb[NROOTS-1] = 0;
+  }
+}
diff --git a/libfec/encode_rs.h b/libfec/encode_rs.h
new file mode 100644
index 0000000..2c157f9
--- /dev/null
+++ b/libfec/encode_rs.h
@@ -0,0 +1,58 @@
+/* The guts of the Reed-Solomon encoder, meant to be #included
+ * into a function body with the following typedefs, macros and variables supplied
+ * according to the code parameters:
+
+ * data_t - a typedef for the data symbol
+ * data_t data[] - array of NN-NROOTS-PAD elements of type data_t to be encoded
+ * data_t parity[] - array of NROOTS elements of type data_t to be written with parity symbols
+ * NROOTS - the number of roots in the RS code generator polynomial,
+ *          which is the same as the number of parity symbols in a block.
+ *          Integer variable or literal.
+ *
+ * NN - the total number of symbols in a RS block. Integer variable or literal.
+ * PAD - the number of pad symbols in a block. Integer variable or literal.
+ * ALPHA_TO - The address of an array of NN elements to convert Galois field
+ *            elements in index (log) form to polynomial form. Read only.
+ * INDEX_OF - The address of an array of NN+1 elements (one per symbol value 0..NN) to convert
+ *            Galois field elements in polynomial form to index (log) form. Read only.
+ * MODNN - a function to reduce its argument modulo NN. May be inline or a macro.
+ * GENPOLY - an array of NROOTS+1 elements containing the generator polynomial in index form
+
+ * The memset() and memmove() functions are used. The appropriate header
+ * file declaring these functions (usually <string.h>) must be included by the calling
+ * program.
+
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+
+#undef A0
+#define A0 (NN) /* Special reserved value encoding zero in index form */
+
+{ /* Encoder body: leaves the NROOTS parity symbols for data[] in parity[] */
+  int i, j;
+  data_t feedback;
+
+  memset(parity,0,NROOTS*sizeof(data_t)); /* parity register starts out all zero */
+
+  for(i=0;i<NN-NROOTS-PAD;i++){ /* one pass per data symbol */
+    feedback = INDEX_OF[data[i] ^ parity[0]]; /* index (log) form of the feedback symbol */
+    if(feedback != A0){      /* feedback term is non-zero */
+#ifdef UNNORMALIZED
+      /* This line is unnecessary when GENPOLY[NROOTS] is unity, as it must
+       * always be for the polynomials constructed by init_rs()
+       */
+      feedback = MODNN(NN - GENPOLY[NROOTS] + feedback);
+#endif
+      for(j=1;j<NROOTS;j++)
+	parity[j] ^= ALPHA_TO[MODNN(feedback + GENPOLY[NROOTS-j])];
+    }
+    /* Shift */
+    memmove(&parity[0],&parity[1],sizeof(data_t)*(NROOTS-1));
+    if(feedback != A0) /* the slot vacated by the shift gets the GENPOLY[0] term, or zero */
+      parity[NROOTS-1] = ALPHA_TO[MODNN(feedback + GENPOLY[0])];
+    else
+      parity[NROOTS-1] = 0;
+  }
+}
diff --git a/libfec/encode_rs_8.c b/libfec/encode_rs_8.c
new file mode 100644
index 0000000..d21294c
--- /dev/null
+++ b/libfec/encode_rs_8.c
@@ -0,0 +1,117 @@
+/* Reed-Solomon encoder
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+#include "fixed.h"
+#ifdef __VEC__
+#include <sys/sysctl.h>
+#endif
+
+
+static enum {UNKNOWN=0,MMX,SSE,SSE2,ALTIVEC,PORT} cpu_mode; /* file-local; enumerator order differs from enum cpu_mode in fec.h */
+
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad);
+#if __vec__ /* NOTE(review): lowercase here, while the <sys/sysctl.h> include above tests __VEC__ - confirm which macro is intended */
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad);
+#endif
+#if __i386__
+int cpu_features(void);
+#endif
+/* (255,223) encoder entry point: picks a CPU mode on the first call, then dispatches on it */
+void encode_rs_8(data_t *data, data_t *parity,int pad){
+  if(cpu_mode == UNKNOWN){ /* first call: cpu_mode is still zero-initialized */
+#ifdef __i386__
+    int f;
+    /* Figure out what kind of CPU we have */
+    f = cpu_features();
+    if(f & (1<<26)){ /* SSE2 is present */
+      cpu_mode = SSE2;
+    } else if(f & (1<<25)){ /* SSE is present */
+      cpu_mode = SSE;
+    } else if(f & (1<<23)){ /* MMX is present */
+      cpu_mode = MMX;
+    } else { /* No SIMD at all */
+      cpu_mode = PORT;
+    }
+#elif __x86_64__
+    cpu_mode = SSE2;
+#elif __VEC__
+    /* Ask the OS if we have Altivec support */
+    int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+    int hasVectorUnit = 0;
+    size_t length = sizeof(hasVectorUnit);
+    int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+    if(0 == error && hasVectorUnit)
+      cpu_mode = ALTIVEC;
+    else
+      cpu_mode = PORT;
+#else
+    cpu_mode = PORT;
+#endif
+  }
+  switch(cpu_mode){
+#if __vec__
+  case ALTIVEC:
+    encode_rs_8_av(data,parity,pad);
+    return;
+#endif
+  /* The x86 cases below have no code of their own and fall through to the C version */
+#if __i386__
+  case MMX:
+  case SSE:
+  case SSE2:
+#endif
+
+#ifdef __x86_64__
+  case SSE2:
+#endif
+
+  default:
+    encode_rs_8_c(data,parity,pad);
+    return;
+  }
+}
+
+#if __vec__ /* PowerPC G4/G5 Altivec instructions are available */
+
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form the other half by reversing this one
+ */
+extern static union { vector unsigned char v; unsigned char c[16]; } table[256];
+/* NOTE(review): "extern static" names two storage classes, which C rejects, and nothing in this file fills table[] - resolve before enabling */
+static void encode_rs_8_av(data_t *data, data_t *parity,int pad){
+  union { vector unsigned char v[2]; unsigned char c[32]; } shift_register;
+  int i;
+
+  shift_register.v[0] = (vector unsigned char)(0);
+  shift_register.v[1] = (vector unsigned char)(0);
+  
+  for(i=0;i<NN-NROOTS-pad;i++){ /* one pass per data symbol */
+    vector unsigned char feedback0,feedback1;
+    unsigned char f;
+
+    f = data[i] ^ shift_register.c[31]; /* feedback symbol: input XOR last register byte */
+    feedback1 = table[f].v;
+    feedback0 = vec_perm(feedback1,feedback1,reverse); /* other half, formed with the reverse permutation */
+
+    /* Shift right one byte */
+    shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+    shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+    shift_register.c[0] = f;
+  }
+  for(i=0;i<NROOTS;i++) /* copy the register out in reverse byte order */
+    parity[NROOTS-i-1] = shift_register.c[i];
+}
+#endif
+
+/* Portable C version: the generic encoder body from encode_rs.h, built with the fixed.h parameters */
+static void encode_rs_8_c(data_t *data, data_t *parity,int pad){
+  /* encode_rs.h expands to the whole function body and reads data, parity and pad by name */
+#include "encode_rs.h"
+
+}
diff --git a/libfec/encode_rs_av.c b/libfec/encode_rs_av.c
new file mode 100644
index 0000000..32e528f
--- /dev/null
+++ b/libfec/encode_rs_av.c
@@ -0,0 +1,61 @@
+/* Fast Reed-Solomon encoder for (255,223) CCSDS code on PowerPC G4/G5 using Altivec instructions
+ * Copyright 2004, Phil Karn KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <string.h>
+#include "fixed.h"
+
+/* Lookup table for feedback multiplications
+ * These are the low half of the coefficients. Since the generator polynomial is
+ * palindromic, we form it by reversing these on the fly
+ */
+static union { vector unsigned char v; unsigned char c[16]; } table[256];
+
+static vector unsigned char reverse = (vector unsigned char)(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1);
+static vector unsigned char shift_right = (vector unsigned char)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+
+extern data_t CCSDS_alpha_to[];
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+
+void rs_init_av(){
+  int sym,tap;
+  /* Entry 0 stays all zero */
+  for(tap=0;tap<16;tap++)
+    table[0].c[tap] = 0;
+  /* The PowerPC is big-endian, so the low-order byte of each vector contains the highest order term in the polynomial */
+  for(sym=1;sym<256;sym++){
+    for(tap=0;tap<16;tap++)
+      table[sym].c[15-tap] = CCSDS_alpha_to[MODNN(CCSDS_poly[tap+1] + CCSDS_index_of[sym])];
+  }
+#if 0
+  for(sym=0;sym<256;sym++){
+    printf("table[%3d] = %3vu\n",sym,table[sym].v);
+  }
+#endif
+}
+
+void encode_rs_av(unsigned char *data,unsigned char *parity,int pad){ /* Altivec encoder; rs_init_av() fills the table it reads */
+  union { vector unsigned char v[2]; unsigned char c[32]; } shift_register;
+  int i;
+
+  shift_register.v[0] = (vector unsigned char)(0);
+  shift_register.v[1] = (vector unsigned char)(0);
+  
+  for(i=0;i<NN-NROOTS-pad;i++){ /* one pass per data symbol */
+    vector unsigned char feedback0,feedback1;
+    unsigned char f;
+
+    f = data[i] ^ shift_register.c[31]; /* feedback symbol: input XOR last register byte */
+    feedback1 = table[f].v;
+    feedback0 = vec_perm(feedback1,feedback1,reverse); /* other half, formed with the reverse permutation */
+
+    /* Shift right one byte */
+    shift_register.v[1] = vec_perm(shift_register.v[0],shift_register.v[1],shift_right) ^ feedback1;
+    shift_register.v[0] = vec_sro(shift_register.v[0],(vector unsigned char)(8)) ^ feedback0;
+    shift_register.c[0] = f;
+  }
+  for(i=0;i<NROOTS;i++) /* copy the register out in reverse byte order */
+    parity[NROOTS-i-1] = shift_register.c[i];
+}
diff --git a/libfec/encode_rs_ccsds.c b/libfec/encode_rs_ccsds.c
new file mode 100644
index 0000000..5a2ec70
--- /dev/null
+++ b/libfec/encode_rs_ccsds.c
@@ -0,0 +1,24 @@
+/* This function wraps around the fixed 8-bit encoder, performing the
+ * basis transformations necessary to meet the CCSDS standard
+ *
+ * Copyright 2002, Phil Karn, KA9Q
+ * fixed bug Aug 2007
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include "ccsds.h"
+#include "fec.h"
+
+void encode_rs_ccsds(data_t *data,data_t *parity,int pad){ /* dual-basis wrapper around encode_rs_8() */
+  int i;
+  data_t cdata[NN-NROOTS];
+  if(pad < 0) return; /* a negative pad would run the copy loop past the end of cdata[] */
+  /* Convert data from dual basis to conventional */
+  for(i=0;i<NN-NROOTS-pad;i++)
+    cdata[i] = Tal1tab[data[i]];
+
+  encode_rs_8(cdata,parity,pad);
+
+  /* Convert parity from conventional to dual basis */
+  for(i=0;i<NROOTS;i++)
+    parity[i] = Taltab[parity[i]];
+}
diff --git a/libfec/encode_rs_char.c b/libfec/encode_rs_char.c
new file mode 100644
index 0000000..a9bf2b8
--- /dev/null
+++ b/libfec/encode_rs_char.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2002, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+void encode_rs_char(void *p,data_t *data, data_t *parity){ /* general-purpose encoder, data_t as defined by char.h */
+  struct rs *rs = (struct rs *)p; /* opaque codec handle; presumably what init_rs_char() returned - confirm */
+  /* encode_rs.h expands to the encoder body and reads data and parity by name */
+#include "encode_rs.h"
+
+}
diff --git a/libfec/encode_rs_int.c b/libfec/encode_rs_int.c
new file mode 100644
index 0000000..3c9ce78
--- /dev/null
+++ b/libfec/encode_rs_int.c
@@ -0,0 +1,15 @@
+/* Reed-Solomon encoder
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <string.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+void encode_rs_int(void *p,data_t *data, data_t *parity){ /* general-purpose encoder, data_t as defined by int.h */
+  struct rs *rs = (struct rs *)p; /* opaque codec handle; presumably what init_rs_int() returned - confirm */
+  /* encode_rs.h expands to the encoder body and reads data and parity by name */
+#include "encode_rs.h"
+
+}
diff --git a/libfec/exercise.c b/libfec/exercise.c
new file mode 100644
index 0000000..8ae008c
--- /dev/null
+++ b/libfec/exercise.c
@@ -0,0 +1,122 @@
+/* Exercise an RS codec a specified number of times using random
+ * data and error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#define FLAG_ERASURE 1 /* Randomly flag 50% of errors as erasures */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef FIXED
+#include "fixed.h"
+#define EXERCISE exercise_8
+#elif defined(CCSDS)
+#include "fixed.h"
+#include "ccsds.h"
+#define EXERCISE exercise_ccsds
+#elif defined(BIGSYM)
+#include "int.h"
+#define EXERCISE exercise_int
+#else
+#include "char.h"
+#define EXERCISE exercise_char
+#endif
+
+#ifdef FIXED
+#define PRINTPARM printf("(255,223):");
+#elif defined(CCSDS)
+#define PRINTPARM printf("CCSDS (255,223):");
+#else
+#define PRINTPARM printf("(%d,%d):",rs->nn,rs->nn-rs->nroots);
+#endif
+
+/* Run `trials` rounds of encode / corrupt / decode on the codec and return how many discrepancies were seen */
+int EXERCISE(
+#if !defined(CCSDS) && !defined(FIXED)
+void *p,
+#endif
+int trials){
+#if !defined(CCSDS) && !defined(FIXED)
+  struct rs *rs = (struct rs *)p;
+#endif
+  data_t block[NN],tblock[NN];
+  int i;
+  int errors;
+  int errlocs[NN];
+  int derrlocs[NROOTS];
+  int derrors;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+
+  while(trials-- != 0){
+    /* Test up to the error correction capacity of the code */
+    for(errors=0;errors <= NROOTS/2;errors++){
+
+      /* Load block with random data and encode */
+      for(i=0;i<NN-NROOTS;i++)
+	block[i] = random() & NN; /* uses NN as a bit mask; true for NN == 255, presumably 2^m-1 in general - confirm */
+      
+#if defined(CCSDS) || defined(FIXED)
+      ENCODE_RS(&block[0],&block[NN-NROOTS],0);
+#else
+      ENCODE_RS(rs,&block[0],&block[NN-NROOTS]);
+#endif
+
+      /* Make temp copy, seed with errors */
+      memcpy(tblock,block,sizeof(tblock));
+      memset(errlocs,0,sizeof(errlocs));
+      memset(derrlocs,0,sizeof(derrlocs));
+      erasures=0;
+      for(i=0;i<errors;i++){
+	do {
+	  errval = random() & NN;
+	} while(errval == 0); /* Error value must be nonzero */
+
+	do {
+	  errloc = random() % NN;
+	} while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+	errlocs[errloc] = 1;
+
+#if FLAG_ERASURE
+	if(random() & 1) /* 50-50 chance */
+	  derrlocs[erasures++] = errloc;
+#endif
+	tblock[errloc] ^= errval;
+      }
+
+      /* Decode the errored block; derrlocs[] carries the erasures in and is read back below as reported locations */
+#if defined(CCSDS) || defined(FIXED)
+      derrors = DECODE_RS(tblock,derrlocs,erasures,0);
+#else
+      derrors = DECODE_RS(rs,tblock,derrlocs,erasures);
+#endif
+
+      if(derrors != errors){ /* reported count disagrees with the number injected */
+	PRINTPARM
+	printf(" decoder says %d errors, true number is %d\n",derrors,errors);
+	decoder_errors++;
+      }
+      for(i=0;i<derrors;i++){ /* every reported location must be one that was corrupted */
+	if(errlocs[derrlocs[i]] == 0){
+	  PRINTPARM
+	  printf(" decoder indicates error in location %d without error\n",derrlocs[i]);
+	  decoder_errors++;
+	}
+      }
+      if(memcmp(tblock,block,sizeof(tblock)) != 0){ /* decoded block must equal the encoded original */
+	PRINTPARM
+	printf(" uncorrected errors! output ^ input:");
+	decoder_errors++;
+	for(i=0;i<NN;i++)
+	  printf(" %02x",tblock[i] ^ block[i]);
+	printf("\n");
+      }
+    }
+  }
+  return decoder_errors;
+}
diff --git a/libfec/fec.c b/libfec/fec.c
new file mode 100644
index 0000000..35960c3
--- /dev/null
+++ b/libfec/fec.c
@@ -0,0 +1,66 @@
+/* Utility routines for FEC support
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+
+#include <stdio.h>
+#include "fec.h"
+
+unsigned char Partab[256];
+int P_init;
+
+/* Fill Partab[] so that Partab[v] is 1 when v has an odd number of 1 bits
+ * and 0 otherwise, then set P_init. Used by the table-driven parityb() in fec.h.
+ */
+void partab_init(void){
+  int value;
+
+  for(value=0;value<256;value++){
+    int bits = value;
+    int odd = 0;
+
+    /* Toggle the flag once for every set bit, consuming bits from the low end */
+    for(;bits != 0;bits >>= 1)
+      odd ^= bits & 1;
+
+    Partab[value] = odd;
+  }
+  /* Mark the table as built */
+  P_init=1;
+}
+
+/* Bitcnt[i] is the number of 1 bits in i, for i = 0..255 (256 entries, 8 per row); declared extern in fec.h */
+int Bitcnt[] = {
+ 0, 1, 1, 2, 1, 2, 2, 3,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7,
+ 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
diff --git a/libfec/fec.h b/libfec/fec.h
new file mode 100644
index 0000000..d6d4b08
--- /dev/null
+++ b/libfec/fec.h
@@ -0,0 +1,355 @@
+/* User include file for libfec
+ * Copyright 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#ifndef _FEC_H_
+#define _FEC_H_
+
+/* r=1/2 k=7 convolutional encoder polynomials
+ * The NASA-DSN convention is to use V27POLYA inverted, then V27POLYB
+ * The CCSDS/NASA-GSFC convention is to use V27POLYB, then V27POLYA inverted
+ */
+#define	V27POLYA	0x6d
+#define	V27POLYB	0x4f
+
+void *create_viterbi27(int len);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char sym[],int npairs);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi27_av(int len);
+void set_viterbi27_polynomial_av(int polys[2]);
+int init_viterbi27_av(void *p,int starting_state);
+int chainback_viterbi27_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_av(void *p);
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi27_mmx(int len);
+void set_viterbi27_polynomial_mmx(int polys[2]);
+int init_viterbi27_mmx(void *p,int starting_state);
+int chainback_viterbi27_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_mmx(void *p);
+int update_viterbi27_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi27_sse(int len);
+void set_viterbi27_polynomial_sse(int polys[2]);
+int init_viterbi27_sse(void *p,int starting_state);
+int chainback_viterbi27_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse(void *p);
+int update_viterbi27_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi27_sse2(int len);
+void set_viterbi27_polynomial_sse2(int polys[2]);
+int init_viterbi27_sse2(void *p,int starting_state);
+int chainback_viterbi27_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_sse2(void *p);
+int update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi27_port(int len);
+void set_viterbi27_polynomial_port(int polys[2]);
+int init_viterbi27_port(void *p,int starting_state);
+int chainback_viterbi27_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27_port(void *p);
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits);
+
+/* r=1/2 k=9 convolutional encoder polynomials */
+#define	V29POLYA	0x1af
+#define	V29POLYB	0x11d
+
+void *create_viterbi29(int len);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi29_av(int len);
+void set_viterbi29_polynomial_av(int polys[2]);
+int init_viterbi29_av(void *p,int starting_state);
+int chainback_viterbi29_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_av(void *p);
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi29_mmx(int len);
+void set_viterbi29_polynomial_mmx(int polys[2]);
+int init_viterbi29_mmx(void *p,int starting_state);
+int chainback_viterbi29_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_mmx(void *p);
+int update_viterbi29_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi29_sse(int len);
+void set_viterbi29_polynomial_sse(int polys[2]);
+int init_viterbi29_sse(void *p,int starting_state);
+int chainback_viterbi29_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse(void *p);
+int update_viterbi29_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi29_sse2(int len);
+void set_viterbi29_polynomial_sse2(int polys[2]);
+int init_viterbi29_sse2(void *p,int starting_state);
+int chainback_viterbi29_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_sse2(void *p);
+int update_viterbi29_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi29_port(int len);
+void set_viterbi29_polynomial_port(int polys[2]);
+int init_viterbi29_port(void *p,int starting_state);
+int chainback_viterbi29_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29_port(void *p);
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits);
+
+/* r=1/3 k=9 convolutional encoder polynomials */
+#define	V39POLYA	0x1ed
+#define	V39POLYB	0x19b
+#define	V39POLYC	0x127
+
+void *create_viterbi39(int len);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi39_av(int len);
+void set_viterbi39_polynomial_av(int polys[3]);
+int init_viterbi39_av(void *p,int starting_state);
+int chainback_viterbi39_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_av(void *p);
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi39_mmx(int len);
+void set_viterbi39_polynomial_mmx(int polys[3]);
+int init_viterbi39_mmx(void *p,int starting_state);
+int chainback_viterbi39_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_mmx(void *p);
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi39_sse(int len);
+void set_viterbi39_polynomial_sse(int polys[3]);
+int init_viterbi39_sse(void *p,int starting_state);
+int chainback_viterbi39_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse(void *p);
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi39_sse2(int len);
+void set_viterbi39_polynomial_sse2(int polys[3]);
+int init_viterbi39_sse2(void *p,int starting_state);
+int chainback_viterbi39_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_sse2(void *p);
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi39_port(int len);
+void set_viterbi39_polynomial_port(int polys[3]);
+int init_viterbi39_port(void *p,int starting_state);
+int chainback_viterbi39_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39_port(void *p);
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits);
+
+
+/* r=1/6 k=15 Cassini convolutional encoder polynomials without symbol inversion
+ * dfree = 56
+ * These bits may be left-right flipped from some textbook representations;
+ * here I have the bits entering the shift register from the right (low) end
+ *
+ * Some other spacecraft use the same code, but with the polynomials in a different order.
+ * E.g., Mars Pathfinder and STEREO swap POLYC and POLYD. All use alternate symbol inversion,
+ * so use set_viterbi615_polynomial() as appropriate.
+ */
+#define	V615POLYA	042631
+#define	V615POLYB	047245
+#define V615POLYC       056507
+#define V615POLYD       073363
+#define V615POLYE       077267
+#define V615POLYF       064537
+
+void *create_viterbi615(int len);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char *syms,int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+
+#ifdef __VEC__
+void *create_viterbi615_av(int len);
+void set_viterbi615_polynomial_av(int polys[6]);
+int init_viterbi615_av(void *p,int starting_state);
+int chainback_viterbi615_av(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_av(void *p);
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits);
+#endif
+
+#ifdef __i386__
+void *create_viterbi615_mmx(int len);
+void set_viterbi615_polynomial_mmx(int polys[6]);
+int init_viterbi615_mmx(void *p,int starting_state);
+int chainback_viterbi615_mmx(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_mmx(void *p);
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi615_sse(int len);
+void set_viterbi615_polynomial_sse(int polys[6]);
+int init_viterbi615_sse(void *p,int starting_state);
+int chainback_viterbi615_sse(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse(void *p);
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits);
+
+void *create_viterbi615_sse2(int len);
+void set_viterbi615_polynomial_sse2(int polys[6]);
+int init_viterbi615_sse2(void *p,int starting_state);
+int chainback_viterbi615_sse2(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_sse2(void *p);
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits);
+#endif
+
+void *create_viterbi615_port(int len);
+void set_viterbi615_polynomial_port(int polys[6]);
+int init_viterbi615_port(void *p,int starting_state);
+int chainback_viterbi615_port(void *p,unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615_port(void *p);
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits);
+
+
+/* General purpose RS codec, 8-bit symbols */
+void encode_rs_char(void *rs,unsigned char *data,unsigned char *parity);
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+		   int no_eras);
+void *init_rs_char(int symsize,int gfpoly,
+		   int fcr,int prim,int nroots,
+		   int pad);
+void free_rs_char(void *rs);
+
+/* General purpose RS codec, integer symbols */
+void encode_rs_int(void *rs,int *data,int *parity);
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+void *init_rs_int(int symsize,int gfpoly,int fcr,
+		  int prim,int nroots,int pad);
+void free_rs_int(void *rs);
+
+/* CCSDS standard (255,223) RS codec with conventional (*not* dual-basis)
+ * symbol representation
+ */
+void encode_rs_8(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,int pad);
+
+/* CCSDS standard (255,223) RS codec with dual-basis symbol representation */
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,int pad);
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,int pad);
+
+/* Tables to map from conventional->dual (Taltab) and
+ * dual->conventional (Tal1tab) bases
+ */
+extern unsigned char Taltab[],Tal1tab[];
+
+
+/* CPU SIMD instruction set available */
+extern enum cpu_mode {UNKNOWN=0,PORT,MMX,SSE,SSE2,ALTIVEC} Cpu_mode;
+void find_cpu_mode(void); /* Call this once at startup to set Cpu_mode */
+
+/* Determine parity of argument: 1 = odd, 0 = even */
+#if defined(__i386__) || defined(__x86_64__)
+static inline int parityb(unsigned char x){ /* x86: setpo stores the parity flag left by "test" */
+  __asm__ __volatile__ ("test %1,%1;setpo %0" : "=q"(x) : "q" (x));
+  return x;
+}
+#else
+void partab_init(void); /* full prototype, matching the definition in fec.c */
+
+static inline int parityb(unsigned char x){ /* other CPUs: look the answer up in Partab[] */
+  extern unsigned char Partab[256];
+  extern int P_init;
+  if(!P_init){ /* build the table on first use */
+    partab_init();
+  }
+  return Partab[x];
+}
+#endif
+
+
+static inline int parity(int x){
+  /* XOR the low four bytes of x together; parityb() takes an unsigned
+   * char, so only the low byte of the folded value is examined. */
+  int folded = x ^ (x >> 8) ^ (x >> 16) ^ (x >> 24);
+  return parityb(folded);
+}
+
+/* Useful utilities for simulation */
+double normal_rand(double mean, double std_dev);
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip);
+
+extern int Bitcnt[];
+
+/* Dot product functions */
+void *initdp(signed short coeffs[],int len);
+void freedp(void *dp);
+long dotprod(void *dp,signed short a[]);
+
+void *initdp_port(signed short coeffs[],int len);
+void freedp_port(void *dp);
+long dotprod_port(void *dp,signed short a[]);
+
+#ifdef __i386__
+void *initdp_mmx(signed short coeffs[],int len);
+void freedp_mmx(void *dp);
+long dotprod_mmx(void *dp,signed short a[]);
+
+void *initdp_sse(signed short coeffs[],int len);
+void freedp_sse(void *dp);
+long dotprod_sse(void *dp,signed short a[]);
+
+void *initdp_sse2(signed short coeffs[],int len);
+void freedp_sse2(void *dp);
+long dotprod_sse2(void *dp,signed short a[]);
+#endif
+
+#ifdef __x86_64__
+void *initdp_sse2(signed short coeffs[],int len);
+void freedp_sse2(void *dp);
+long dotprod_sse2(void *dp,signed short a[]);
+#endif
+
+#ifdef __VEC__
+void *initdp_av(signed short coeffs[],int len);
+void freedp_av(void *dp);
+long dotprod_av(void *dp,signed short a[]);
+#endif
+
+/* Sum of squares - accepts signed shorts, produces unsigned long long */
+unsigned long long sumsq(signed short *in,int cnt);
+unsigned long long sumsq_port(signed short *in,int cnt);
+
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *in,int cnt);
+unsigned long long sumsq_sse(signed short *in,int cnt);
+unsigned long long sumsq_sse2(signed short *in,int cnt);
+#endif
+#ifdef __x86_64__
+unsigned long long sumsq_sse2(signed short *in,int cnt);
+#endif
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *in,int cnt);
+#endif
+
+
+/* Low-level data structures and routines */
+
+int cpu_features(void);
+
+#endif /* _FEC_H_ */
+
+
+
diff --git a/libfec/fixed.h b/libfec/fixed.h
new file mode 100644
index 0000000..0ff27b2
--- /dev/null
+++ b/libfec/fixed.h
@@ -0,0 +1,33 @@
+/* Stuff specific to the CCSDS (255,223) RS codec
+ * (255,223) code over GF(256). Note: the conventional basis is still
+ * used; the dual-basis mappings are performed in [en|de]code_rs_ccsds.c
+ *
+ * Copyright 2003 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned char data_t;
+
+static inline int mod255(int x){
+  /* Each pass subtracts 255, then adds the bits above the low byte back
+   * into the low byte; values below 255 are returned untouched. */
+  for(; x >= 255; x = (x >> 8) + (x & 255))
+    x -= 255;
+  return x;
+}
+#define MODNN(x) mod255(x)
+
+extern data_t CCSDS_alpha_to[];
+extern data_t CCSDS_index_of[];
+extern data_t CCSDS_poly[];
+
+#define MM 8
+#define NN 255
+#define ALPHA_TO CCSDS_alpha_to
+#define INDEX_OF CCSDS_index_of
+#define GENPOLY CCSDS_poly
+#define NROOTS 32
+#define FCR 112
+#define PRIM 11
+#define IPRIM 116
+#define PAD pad
+
diff --git a/libfec/gen_ccsds.c b/libfec/gen_ccsds.c
new file mode 100644
index 0000000..e1e2e26
--- /dev/null
+++ b/libfec/gen_ccsds.c
@@ -0,0 +1,39 @@
+/* Generate tables for CCSDS code
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "char.h"
+#include "rs-common.h"
+#include "fec.h"
+
+int main(){ /* Print the CCSDS (255,223) alpha_to, index_of and generator tables as C source on stdout */
+  struct rs *rs;
+  int i;
+
+  rs = init_rs_char(8,0x187,112,11,32,0); /* CCSDS standard */
+  assert(rs != NULL);
+  printf("unsigned char CCSDS_alpha_to[] = {"); /* unsigned char matches data_t in fixed.h; plain char may be signed */
+  for(i=0;i<256;i++){
+    if((i % 16) == 0)
+      printf("\n");
+    printf("0x%02x,",rs->alpha_to[i]);
+  }
+  printf("\n};\n\nunsigned char CCSDS_index_of[] = {");
+  for(i=0;i<256;i++){
+    if((i % 16) == 0)
+      printf("\n");
+    printf("%3d,",rs->index_of[i]);
+  }
+  printf("\n};\n\nunsigned char CCSDS_poly[] = {");
+  for(i=0;i<33;i++){ /* 33 = nroots + 1 generator coefficients for the 32 roots requested above */
+    if((i % 16) == 0)
+      printf("\n");
+
+    printf("%3d,",rs->genpoly[i]);
+  }
+  printf("\n};\n");
+  exit(0);
+}
diff --git a/libfec/gen_ccsds_tal.c b/libfec/gen_ccsds_tal.c
new file mode 100644
index 0000000..fc75503
--- /dev/null
+++ b/libfec/gen_ccsds_tal.c
@@ -0,0 +1,53 @@
+/* Conversion lookup tables from conventional alpha to Berlekamp's
+ * dual-basis representation. Used in the CCSDS version only.
+ * Taltab[] -- convert conventional to dual basis
+ * Tal1tab[] -- convert dual basis to conventional
+
+ * Note: the actual RS encoder/decoder works with the conventional basis.
+ * So data is converted from dual to conventional basis before either
+ * encoding or decoding and then converted back.
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#define DTYPE unsigned char
+DTYPE Taltab[256],Tal1tab[256];
+
+static DTYPE tal[] = { 0x8d, 0xef, 0xec, 0x86, 0xfa, 0x99, 0xaf, 0x7b };
+
+/* Generate conversion lookup tables between conventional alpha representation
+ * (@**7, @**6, ...@**0)
+ *  and Berlekamp's dual basis representation
+ * (l0, l1, ...l7)
+ */
+int main(){
+  int value,bit,n;
+
+  for(value=0;value<256;value++){
+    /* XOR the tal[] rows picked by the set bits of value (bit k -> tal[7-k]) */
+    DTYPE dual = 0;
+    for(bit=0;bit<8;bit++)
+      if((value >> bit) & 1)
+        dual ^= tal[7-bit];
+    Taltab[value] = dual;
+    Tal1tab[dual] = value; /* inverse mapping */
+  }
+  printf("unsigned char Taltab[] = {\n");
+  for(n=0;n<256;n++){
+    if(!(n & 15)) /* 16 entries per output line */
+      printf("\n");
+    printf("0x%02x,",Taltab[n]);
+  }
+  printf("\n};\n\nunsigned char Tal1tab[] = {");
+  for(n=0;n<256;n++){
+    if(!(n & 15))
+      printf("\n");
+    printf("0x%02x,",Tal1tab[n]);
+  }
+  printf("\n};\n");
+  return 0;
+}
+
diff --git a/libfec/init_rs.c b/libfec/init_rs.c
new file mode 100644
index 0000000..ef1cf47
--- /dev/null
+++ b/libfec/init_rs.c
@@ -0,0 +1,39 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+#if !defined(NULL)
+#define NULL ((void *)0)
+#endif
+
+#include "rs-common.h"
+
+void free_rs(void *p){ /* release a codec and its tables; NULL is accepted, like free() */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* e.g. the result of a failed init_rs_common() */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec and return it as an opaque pointer
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_common(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* assigned by the included template */
+
+#include "init_rs.h" /* shared function body: checks the arguments and builds the tables */
+
+  return rs; /* NULL when init_rs.h rejected an argument or an allocation failed */
+}
diff --git a/libfec/init_rs.h b/libfec/init_rs.h
new file mode 100644
index 0000000..2b2ae98
--- /dev/null
+++ b/libfec/init_rs.h
@@ -0,0 +1,106 @@
+/* Common code for initializing a Reed-Solomon control block (char or int symbols)
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#undef NULL
+#define NULL ((void *)0)
+
+{
+  int i, j, sr,root,iprim;
+
+  rs = NULL;
+  /* Check parameter ranges; 1<<symsize must also be representable as an int */
+  if(symsize < 0 || symsize > 8*sizeof(data_t) || symsize+1 >= 8*sizeof(int))
+    goto done;
+
+  if(fcr < 0 || fcr >= (1<<symsize))
+    goto done;
+  if(prim <= 0 || prim >= (1<<symsize))
+    goto done;
+  if(nroots < 0 || nroots >= (1<<symsize))
+    goto done; /* Can't have more roots than symbol values! */
+  if(pad < 0 || pad >= ((1<<symsize) -1 - nroots))
+    goto done; /* Too much padding */
+
+  /* prim must be coprime to nn = 2^symsize-1, else the iprim search below
+   * never terminates. Euclid's algorithm leaves gcd(nn,prim) in i. */
+  for(i=(1<<symsize)-1,j=prim;j != 0;sr=i%j,i=j,j=sr)
+    ;
+  if(i != 1)
+    goto done;
+
+  rs = (struct rs *)calloc(1,sizeof(struct rs));
+  if(rs == NULL)
+    goto done;
+
+  rs->mm = symsize;
+  rs->nn = (1<<symsize)-1;
+  rs->pad = pad;
+
+  rs->alpha_to = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+  rs->index_of = (data_t *)malloc(sizeof(data_t)*(rs->nn+1));
+  if(rs->alpha_to == NULL || rs->index_of == NULL){
+    free(rs->alpha_to); /* free(NULL) is a no-op, so no need to test each */
+    free(rs->index_of);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+
+  /* Generate Galois field lookup tables */
+  rs->index_of[0] = A0; /* log(zero) = -inf */
+  rs->alpha_to[A0] = 0; /* alpha**-inf = 0 */
+  sr = 1;
+  for(i=0;i<rs->nn;i++){
+    rs->index_of[sr] = i;
+    rs->alpha_to[i] = sr;
+    sr <<= 1;
+    if(sr & (1<<symsize))
+      sr ^= gfpoly;
+    sr &= rs->nn;
+  }
+  if(sr != 1){
+    /* field generator polynomial is not primitive! */
+    free(rs->alpha_to);
+    free(rs->index_of);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+
+  /* Form RS code generator polynomial from its roots */
+  rs->genpoly = (data_t *)malloc(sizeof(data_t)*(nroots+1));
+  if(rs->genpoly == NULL){
+    free(rs->alpha_to);
+    free(rs->index_of);
+    free(rs);
+    rs = NULL;
+    goto done;
+  }
+  rs->fcr = fcr;
+  rs->prim = prim;
+  rs->nroots = nroots;
+
+  /* Find iprim with prim*iprim == 1 (mod nn), used in decoding */
+  for(iprim=1;(iprim % prim) != 0;iprim += rs->nn)
+    ;
+  rs->iprim = iprim / prim;
+
+  rs->genpoly[0] = 1;
+  for (i = 0,root=fcr*prim; i < nroots; i++,root += prim) {
+    rs->genpoly[i+1] = 1;
+    /* Multiply rs->genpoly[] by  @**(root + x) */
+    for (j = i; j > 0; j--){
+      if (rs->genpoly[j] != 0)
+	rs->genpoly[j] = rs->genpoly[j-1] ^ rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[j]] + root)];
+      else
+	rs->genpoly[j] = rs->genpoly[j-1];
+    }
+    /* rs->genpoly[0] can never be zero */
+    rs->genpoly[0] = rs->alpha_to[modnn(rs,rs->index_of[rs->genpoly[0]] + root)];
+  }
+  /* convert rs->genpoly[] to index form for quicker encoding */
+  for (i = 0; i <= nroots; i++)
+    rs->genpoly[i] = rs->index_of[rs->genpoly[i]];
+ done:;
+}
diff --git a/libfec/init_rs_char.c b/libfec/init_rs_char.c
new file mode 100644
index 0000000..a51099a
--- /dev/null
+++ b/libfec/init_rs_char.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+
+#include "char.h"
+#include "rs-common.h"
+
+void free_rs_char(void *p){ /* release a codec and its tables; NULL is accepted, like free() */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* e.g. the result of a failed init_rs_char() */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec using the symbol type from char.h
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* assigned by the included template */
+
+#include "init_rs.h" /* shared function body: checks the arguments and builds the tables */
+
+  return rs; /* NULL when init_rs.h rejected an argument or an allocation failed */
+}
diff --git a/libfec/init_rs_int.c b/libfec/init_rs_int.c
new file mode 100644
index 0000000..a6036c2
--- /dev/null
+++ b/libfec/init_rs_int.c
@@ -0,0 +1,35 @@
+/* Initialize a RS codec
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdlib.h>
+
+#include "int.h"
+#include "rs-common.h"
+
+void free_rs_int(void *p){ /* release a codec and its tables; NULL is accepted, like free() */
+  struct rs *rs = (struct rs *)p;
+  if(rs == NULL) return; /* e.g. the result of a failed init_rs_int() */
+  free(rs->alpha_to);
+  free(rs->index_of);
+  free(rs->genpoly);
+  free(rs);
+}
+
+/* Initialize a Reed-Solomon codec using the symbol type from int.h (unsigned int)
+ * symsize = symbol size, bits
+ * gfpoly = Field generator polynomial coefficients
+ * fcr = first root of RS code generator polynomial, index form
+ * prim = primitive element to generate polynomial roots
+ * nroots = RS code generator polynomial degree (number of roots)
+ * pad = padding bytes at front of shortened block
+ */
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+	int nroots,int pad){
+  struct rs *rs; /* assigned by the included template */
+
+#include "init_rs.h" /* shared function body: checks the arguments and builds the tables */
+
+  return rs; /* NULL when init_rs.h rejected an argument or an allocation failed */
+}
diff --git a/libfec/install-sh b/libfec/install-sh
new file mode 100755
index 0000000..e9de238
--- /dev/null
+++ b/libfec/install-sh
@@ -0,0 +1,251 @@
+#!/bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5 (mit/util/scripts/install.sh).
+#
+# Copyright 1991 by the Massachusetts Institute of Technology
+#
+# Permission to use, copy, modify, distribute, and sell this software and its
+# documentation for any purpose is hereby granted without fee, provided that
+# the above copyright notice appear in all copies and that both that
+# copyright notice and this permission notice appear in supporting
+# documentation, and that the name of M.I.T. not be used in advertising or
+# publicity pertaining to distribution of the software without specific,
+# written prior permission.  M.I.T. makes no representations about the
+# suitability of this software for any purpose.  It is provided "as is"
+# without express or implied warranty.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.  It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+transformarg=""	# same name that -t= sets and the rename step tests; cleared so the environment cannot leak in
+instcmd="$mvprog"	# default action is a move; -c switches to copy
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do	# stops at the first empty argument
+    case $1 in
+	-c) instcmd="$cpprog"	# copy the source instead of moving it
+	    shift
+	    continue;;
+
+	-d) dir_arg=true	# create a directory instead of installing a file
+	    shift
+	    continue;;
+
+	-m) chmodcmd="$chmodprog $2"	# mode taken from the next argument
+	    shift
+	    shift
+	    continue;;
+
+	-o) chowncmd="$chownprog $2"	# owner taken from the next argument
+	    shift
+	    shift
+	    continue;;
+
+	-g) chgrpcmd="$chgrpprog $2"	# group taken from the next argument
+	    shift
+	    shift
+	    continue;;
+
+	-s) stripcmd="$stripprog"	# strip the installed file
+	    shift
+	    continue;;
+
+	-t=*) transformarg=`echo $1 | sed 's/-t=//'`	# sed expression applied to the destination basename
+	    shift
+	    continue;;
+
+	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`	# suffix removed before the sed step and re-appended after it
+	    shift
+	    continue;;
+
+	*)  if [ x"$src" = x ]	# first plain argument is the source, any later one replaces dst
+	    then
+		src=$1
+	    else
+		# this colon is to work around a 386BSD /bin/sh bug
+		:
+		dst=$1
+	    fi
+	    shift
+	    continue;;
+    esac
+done
+
+if [ x"$src" = x ]
+then
+	echo "install:	no input file specified"
+	exit 1
+else
+	true
+fi
+
+if [ x"$dir_arg" != x ]; then
+	dst=$src	# with -d the only plain argument names the directory to create
+	src=""
+	
+	if [ -d $dst ]; then
+		instcmd=:	# already there: nothing to create and no chmod
+		chmodcmd=""
+	else
+		instcmd=$mkdirprog	# honour a MKDIRPROG override, as the parent-directory loop does
+	fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+	if [ -f $src -o -d $src ]
+	then
+		true
+	else
+		echo "install:  $src does not exist"
+		exit 1
+	fi
+	
+	if [ x"$dst" = x ]
+	then
+		echo "install:	no destination specified"
+		exit 1
+	else
+		true
+	fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+	if [ -d $dst ]
+	then
+		dst="$dst"/`basename $src`
+	else
+		true
+	fi
+fi
+
+## this sed command emulates the dirname command ("." when $dst has no slash)
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='	
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/, so split on '%' after rewriting each '/' to '%'.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+# Walk the path one component at a time, creating each missing directory.
+while [ $# -ne 0 ] ; do
+	pathcomp="${pathcomp}${1}"
+	shift
+	if [ ! -d "${pathcomp}" ] ;
+        then
+		$mkdirprog "${pathcomp}"
+	else
+		true
+	fi
+
+	pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+	$doit $instcmd $dst &&
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+	if [ x"$transformarg" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		dstfile=`basename $dst $transformbasename | 
+			sed $transformarg`$transformbasename
+	fi
+
+# don't allow the sed command to completely eliminate the filename
+
+	if [ x"$dstfile" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		true
+	fi
+
+# Make a temp file name in the destination directory, unique per process ($$).
+
+	dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the source file to the temp name
+
+	$doit $instcmd $src $dsttmp &&
+
+	trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now remove any old file at the real destination and rename the temp file to it.
+
+	$doit $rmcmd -f $dstdir/$dstfile &&
+	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+# exit 0 only runs when every step above succeeded.
+exit 0
diff --git a/libfec/int.h b/libfec/int.h
new file mode 100644
index 0000000..46e865d
--- /dev/null
+++ b/libfec/int.h
@@ -0,0 +1,22 @@
+/* Stuff specific to the general (integer) version of the Reed-Solomon codecs
+ *
+ * Copyright 2003, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+typedef unsigned int data_t;
+
+#define MODNN(x) modnn(rs,x)
+
+/* Each macro below reads a field of the struct rs pointer named rs at the point of use */
+#define MM (rs->mm)
+#define NN (rs->nn)
+#define ALPHA_TO (rs->alpha_to) 
+#define INDEX_OF (rs->index_of)
+#define GENPOLY (rs->genpoly)
+#define NROOTS (rs->nroots)
+#define FCR (rs->fcr)
+#define PRIM (rs->prim)
+#define IPRIM (rs->iprim)
+#define PAD (rs->pad)
+#define A0 (NN) /* index-form marker for the zero element; init_rs.h stores it as log(0) */
+
+
diff --git a/libfec/lesser.txt b/libfec/lesser.txt
new file mode 100644
index 0000000..b1e3f5a
--- /dev/null
+++ b/libfec/lesser.txt
@@ -0,0 +1,504 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+  
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/libfec/libfec.pc.in b/libfec/libfec.pc.in
new file mode 100644
index 0000000..c569da9
--- /dev/null
+++ b/libfec/libfec.pc.in
@@ -0,0 +1,13 @@
+prefix=@LIBFEC_PC_PREFIX@
+exec_prefix=@LIBFEC_PC_EXEC_PREFIX@
+libdir=@LIBFEC_PC_LIBDIR@
+includedir=@LIBFEC_PC_INCLUDEDIR@
+
+Name: FEC library
+Description: A fork of KA9Q's FEC library
+Version: @LIBFEC_PC_VERSION@
+URL: http://opendigitalradio.org
+Cflags: -I${includedir}/ @LIBFEC_PC_CFLAGS@
+Libs: -L${libdir}/ @LIBFEC_PC_LIBS@
+Libs.private: @LIBFEC_PC_PRIV_LIBS@
+
diff --git a/libfec/makefile.in b/libfec/makefile.in
new file mode 100644
index 0000000..cc116ab
--- /dev/null
+++ b/libfec/makefile.in
@@ -0,0 +1,249 @@
+# Makefile prototype for configure
+# Copyright 2004 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+# @configure_input@
+srcdir = @srcdir@
+prefix = @prefix@
+exec_prefix=@exec_prefix@
+VPATH = @srcdir@
+CC=@CC@
+LIBS=@MLIBS@ fec.o sim.o viterbi27.o viterbi27_port.o viterbi29.o viterbi29_port.o \
+	viterbi39.o viterbi39_port.o \
+	viterbi615.o viterbi615_port.o encode_rs_char.o encode_rs_int.o encode_rs_8.o \
+	decode_rs_char.o decode_rs_int.o decode_rs_8.o \
+	init_rs_char.o init_rs_int.o ccsds_tab.o \
+	encode_rs_ccsds.o decode_rs_ccsds.o ccsds_tal.o \
+	dotprod.o dotprod_port.o \
+	peakval.o peakval_port.o \
+	sumsq.o sumsq_port.o
+
+CFLAGS=@CFLAGS@ -I. -fPIC -Wall @ARCH_OPTION@
+
+SHARED_LIB=@SH_LIB@
+
+all: libfec.a $(SHARED_LIB)
+	
+
+test: vtest27 vtest29 vtest39 vtest615 rstest dtest sumsq_test peaktest
+	@echo "Correctness tests:"
+	./vtest27 -e 3.0 -n 1000 -v
+	./vtest29 -e 2.5 -n 1000 -v
+	./vtest39 -e 2.5 -n 1000 -v
+	./vtest615 -e 1.0 -n 100 -v
+	./rstest
+	./dtest
+	./sumsq_test
+	./peaktest
+	@echo "Speed tests:"
+	./vtest27
+	./vtest29
+	./vtest39
+	./vtest615
+
+install: all
+	mkdir -p $(DESTDIR)@libdir@ 
+	install -m 644 -p $(SHARED_LIB) libfec.a $(DESTDIR)@libdir@
+#	(cd $(DESTDIR)@libdir@;ln -f -s $(SHARED_LIB) libfec.so)
+	@REBIND@
+	mkdir -p $(DESTDIR)@includedir@
+	install -m 644 -p fec.h $(DESTDIR)@includedir@
+	mkdir -m 0755 -p $(DESTDIR)@mandir@/man3
+	install -m 644 -p simd-viterbi.3 rs.3 dsp.3 $(DESTDIR)@mandir@/man3
+
+# Test programs and their objects: built with $(CC), the compiler configure chose, so they match the library objects
+peaktest: peaktest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+sumsq_test: sumsq_test.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+dtest: dtest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest27: vtest27.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest29: vtest29.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest39: vtest39.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+vtest615: vtest615.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^ -lm
+
+rstest: rstest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+
+rs_speedtest: rs_speedtest.o libfec.a
+	$(CC) $(CFLAGS) -g -o $@ $^
+# for some reason, the test programs without args segfault on the PPC with -O2 optimization. Dunno why - compiler bug?
+vtest27.o: vtest27.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest29.o: vtest29.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest39.o: vtest39.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+vtest615.o: vtest615.c fec.h
+	$(CC) $(CFLAGS) -g -c $<
+
+libfec.a: $(LIBS)
+	ar rv $@ $^
+	ranlib $@
+
+# for Darwin
+libfec.dylib: $(LIBS)
+	$(CC) -dynamiclib -install_name $@ -o $@ $^
+
+# for Linux et al; linked with $(CC) like the Darwin rule rather than a hard-coded gcc
+libfec.so: $(LIBS)
+	$(CC) -fPIC -shared -Xlinker -soname=$@ -o $@ -Wl,-whole-archive $^ -Wl,-no-whole-archive -lc -lm
+
+dotprod.o: dotprod.c fec.h
+
+dotprod_port.o: dotprod_port.c fec.h
+
+viterbi27.o: viterbi27.c fec.h
+
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+
+encode_rs_char.o: encode_rs_char.c char.h rs-common.h
+
+encode_rs_int.o: encode_rs_int.c int.h rs-common.h
+
+encode_rs_8.o: encode_rs_8.c fixed.h
+
+encode_rs_av.o: encode_rs_av.c fixed.h
+
+decode_rs_char.o: decode_rs_char.c char.h rs-common.h
+
+decode_rs_int.o: decode_rs_int.c int.h rs-common.h
+
+decode_rs_8.o: decode_rs_8.c fixed.h
+
+init_rs_char.o: init_rs_char.c char.h rs-common.h
+
+init_rs_int.o: init_rs_int.c int.h rs-common.h
+
+ccsds_tab.o: ccsds_tab.c
+
+# Generated CCSDS tables and the exerciser objects; everything is built with $(CC) instead of a hard-coded gcc
+ccsds_tab.c: gen_ccsds
+	./gen_ccsds > ccsds_tab.c
+
+gen_ccsds: gen_ccsds.o init_rs_char.o
+	$(CC) $(CFLAGS) -o $@ $^
+
+gen_ccsds.o: gen_ccsds.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+ccsds_tal.o: ccsds_tal.c
+ccsds_tal.c: gen_ccsds_tal
+	./gen_ccsds_tal > ccsds_tal.c
+
+exercise_char.o: exercise.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+exercise_int.o: exercise.c
+	$(CC) -DBIGSYM=1 $(CFLAGS) -c -o $@ $<
+
+exercise_8.o: exercise.c
+	$(CC) -DFIXED=1 $(CFLAGS) -c -o $@ $<
+
+exercise_ccsds.o: exercise.c
+	$(CC) -DCCSDS=1 $(CFLAGS) -c -o $@ $<
+
+# Per-decoder objects; the SIMD variants add their -m<isa> flag and are compiled with $(CC) like every other object
+viterbi27.o: viterbi27.c fec.h
+viterbi27_port.o: viterbi27_port.c fec.h
+
+viterbi27_av.o: viterbi27_av.c fec.h
+
+viterbi27_mmx.o: viterbi27_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi27_sse.o: viterbi27_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi27_sse2.o: viterbi27_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi29.o: viterbi29.c fec.h
+
+viterbi29_port.o: viterbi29_port.c fec.h
+
+viterbi29_av.o: viterbi29_av.c fec.h
+
+viterbi29_mmx.o: viterbi29_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi29_sse.o: viterbi29_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi29_sse2.o: viterbi29_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi39.o: viterbi39.c fec.h
+
+viterbi39_port.o: viterbi39_port.c fec.h
+
+viterbi39_av.o: viterbi39_av.c fec.h
+
+viterbi39_mmx.o: viterbi39_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi39_sse.o: viterbi39_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi39_sse2.o: viterbi39_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+viterbi615.o: viterbi615.c fec.h
+
+viterbi615_port.o: viterbi615_port.c fec.h
+
+viterbi615_av.o: viterbi615_av.c fec.h
+
+viterbi615_mmx.o: viterbi615_mmx.c fec.h
+	$(CC) $(CFLAGS) -mmmx -c -o $@ $<
+
+viterbi615_sse.o: viterbi615_sse.c fec.h
+	$(CC) $(CFLAGS) -msse -c -o $@ $<
+
+viterbi615_sse2.o: viterbi615_sse2.c fec.h
+	$(CC) $(CFLAGS) -msse2 -c -o $@ $<
+
+cpu_mode_x86.o: cpu_mode_x86.c fec.h
+
+cpu_mode_x86_64.o: cpu_mode_x86_64.c fec.h
+
+cpu_mode_ppc.o: cpu_mode_ppc.c fec.h
+
+#%.o:   %.s
+#	$(AS) $< -o $@
+
+
+
+clean:
+	rm -f *.o $(SHARED_LIB) *.a rs_speedtest peaktest sumsq_test dtest vtest27 vtest29 vtest39 vtest615 rstest ccsds_tab.c ccsds_tal.c gen_ccsds gen_ccsds_tal core
+	rm -rf autom4te.cache
+
+distclean: clean
+	rm -f config.log config.cache config.status config.h makefile
+
diff --git a/libfec/mmxbfly27.s b/libfec/mmxbfly27.s
new file mode 100644
index 0000000..4abbf48
--- /dev/null
+++ b/libfec/mmxbfly27.s
@@ -0,0 +1,148 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_mmx(struct v27 *vp,unsigned char *syms,int nbits) ; 
+*/
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+	# These are offsets into struct v27, defined in viterbi27_mmx.c
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text	
+	.global update_viterbi27_blk_mmx,Mettab27_1,Mettab27_2
+	.type update_viterbi27_blk_mmx,@function
+	.align 16
+	
+update_viterbi27_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	# stack arguments: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; status is returned in eax
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 ('$' makes this an immediate, not a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+	# main loop: each pass consumes one bit count and two symbol bytes
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+	# split the pair: eax = first symbol, ebx = second symbol
+	movb %ah,%bl
+	andl $255,%eax
+	andl $255,%ebx
+
+	# shift into first array index dimension slot
+	shll $5,%eax
+	shll $5,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0
+
+	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3  # emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1
+	movq %mm6,%mm1
+	movq %mm2,%mm7
+
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor  %mm0,%mm3		 # invert branch metric
+	paddb %mm3,%mm7		 # path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7;  mm1 and mm2
+	pxor %mm3,%mm3
+	movq %mm6,%mm4
+	movq %mm1,%mm5
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7
+	movq %mm5,%mm3
+	pand %mm5,%mm2
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new branch metrics in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)
+
+	.endm
+
+# invoke macro 4 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+
+	addl $64,%edx		# bump decision pointer
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:	
+	.byte 15,15,15,15,15,15,15,15
+	
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/libfec/mmxbfly29.s b/libfec/mmxbfly29.s
new file mode 100644
index 0000000..e37cab8
--- /dev/null
+++ b/libfec/mmxbfly29.s
@@ -0,0 +1,161 @@
+/* Intel SIMD MMX implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits);
+*/
+
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+	.text	
+	.global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2
+	.type update_viterbi29_blk_mmx,@function
+	.align 16
+	
+	# MMX (64-bit SIMD) version
+	# requires Pentium-MMX, Pentium-II or better
+
+update_viterbi29_blk_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	# stack arguments: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; status is returned in eax
+	movl 8(%ebp),%edx	# edx = vp
+	# a NULL vp is rejected before any of its fields are read
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 ('$' makes this an immediate, not a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+	# main loop: each pass consumes one bit count and two symbol bytes
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	movl 12(%ebp),%ebx	# ebx = syms
+	movw (%ebx),%ax		# ax = second symbol : first symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+	# split the pair: eax = first symbol, ebx = second symbol
+	movb %ah,%bl
+	andl $255,%eax
+	andl $255,%ebx
+
+	# shift into first array index dimension slot
+	shll $7,%eax
+	shll $7,%ebx
+
+	# each invocation of this macro will do 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# Compute branch metrics
+	movq (Mettab29_1+8*\GROUP)(%eax),%mm3
+	movq fifteens,%mm0
+	paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3
+	paddb ones,%mm3  # emulate pavgb - this may not be necessary
+	psrlq $1,%mm3
+	pand %mm0,%mm3
+
+	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1
+	movq %mm6,%mm1
+	movq %mm2,%mm7
+
+	paddb %mm3,%mm6
+	paddb %mm3,%mm2
+	pxor  %mm0,%mm3		 # invert branch metric
+	paddb %mm3,%mm7		 # path metric for inverted symbols
+	paddb %mm3,%mm1
+
+	# live registers 1 2 6 7
+	# Compare mm6 and mm7;  mm1 and mm2
+	pxor %mm3,%mm3
+	movq %mm6,%mm4
+	movq %mm1,%mm5
+	psubb %mm7,%mm4		# mm4 = mm6 - mm7
+	psubb %mm2,%mm5		# mm5 = mm1 - mm2
+	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
+	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions
+
+	# live registers 1 2 4 5 6 7
+	# select survivors
+	movq %mm4,%mm0
+	pand %mm4,%mm7
+	movq %mm5,%mm3
+	pand %mm5,%mm2
+	pandn %mm6,%mm0
+	pandn %mm1,%mm3
+	por %mm0,%mm7		# mm7 = first set of survivors
+	por %mm3,%mm2		# mm2 = second set of survivors
+
+	# live registers 2 4 5 7
+	# interleave & store decisions in mm4, mm5
+	# interleave & store new branch metrics in mm2, mm7
+	movq %mm4,%mm3
+	movq %mm7,%mm0
+	punpckhbw %mm5,%mm4
+	punpcklbw %mm5,%mm3
+	punpcklbw %mm2,%mm7	# interleave second 8 new metrics
+	punpckhbw %mm2,%mm0	# interleave first 8 new metrics
+	movq %mm4,(16*\GROUP+8)(%edx)
+	movq %mm3,(16*\GROUP)(%edx)
+	movq %mm7,(16*\GROUP)(%edi)
+	movq %mm0,(16*\GROUP+8)(%edi)
+
+	.endm
+
+# invoke macro 16 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+	butterfly GROUP=8
+	butterfly GROUP=9
+	butterfly GROUP=10
+	butterfly GROUP=11
+	butterfly GROUP=12
+	butterfly GROUP=13
+	butterfly GROUP=14
+	butterfly GROUP=15
+
+	addl $256,%edx		# bump decision pointer
+
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+fifteens:	
+	.byte 15,15,15,15,15,15,15,15
+
+	.align 8
+ones:	.byte 1,1,1,1,1,1,1,1
diff --git a/libfec/peak_mmx_assist.s b/libfec/peak_mmx_assist.s
new file mode 100644
index 0000000..dae831f
--- /dev/null
+++ b/libfec/peak_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_mmx(signed short *in,int cnt);	
+	.global peakval_mmx
+	.type peakval_mmx,@function
+	.align 16
+peakval_mmx:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx		# saved and restored, though ebx is not otherwise used in this routine
+	# stack arguments: 8(%ebp) = in, 12(%ebp) = cnt; the peak is returned in eax
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# claim the next group of 4 samples
+	jl 2f			# fewer than 4 left: stop (leftover samples are not examined here)
+	movq (%esi),%mm0	# load 4 x 16-bit samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	movq %mm7,%mm6		# copy previous peak
+	pcmpgtw %mm0,%mm6	# ff == old peak greater
+	pand %mm6,%mm7		# select old peaks that are greater
+	pandn %mm0,%mm6		# select new values that are greater
+	por %mm6,%mm7		# mm7 = per-lane running peak
+	
+	addl $8,%esi		# advance 4 samples (8 bytes)
+	jmp 1b	
+
+2:	movd %mm7,%eax		# reduce the four 16-bit lanes of mm7 into eax, starting with lane 0
+	psrlq $16,%mm7
+	andl $0xffff,%eax	# keep only the low 16 bits
+	
+	movd %mm7,%edx		# lane 1
+	psrlq $16,%mm7
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl  3f			# eax already >= edx
+	movl %edx,%eax		# eax = larger of the two
+3:		
+	movd %mm7,%edx		# lane 2
+	psrlq $16,%mm7
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl 4f
+	movl %edx,%eax
+4:		
+	movd %mm7,%edx		# lane 3
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl 5f
+	movl %edx,%eax
+5:	
+	emms			# clear MMX state before returning
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret			# result in eax
+	
diff --git a/libfec/peak_sse2_assist.s b/libfec/peak_sse2_assist.s
new file mode 100644
index 0000000..1dee3a8
--- /dev/null
+++ b/libfec/peak_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse2(signed short *in,int cnt);
+	.global peakval_sse2
+	.type peakval_sse2,@function
+	.align 16
+peakval_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	# stack arguments: 8(%ebp) = in, 12(%ebp) = cnt; the peak is returned in eax
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %xmm7,%xmm7	# clear peak
+	
+1:	subl $8,%ecx		# claim the next group of 8 samples
+	jl 2f			# fewer than 8 left: stop (leftover samples are not examined here)
+	movaps (%esi),%xmm0	# aligned 16-byte load - presumably the caller guarantees 16-byte alignment of in; verify
+	movaps %xmm0,%xmm1	
+	psraw $15,%xmm1		# xmm1 = 1's if negative, 0's if positive
+	pxor %xmm1,%xmm0	# complement negatives
+	psubw %xmm1,%xmm0	# add 1 to negatives
+	pmaxsw %xmm0,%xmm7	# store peak
+	
+	addl $16,%esi		# advance 8 samples (16 bytes)
+	jmp 1b
+
+2:	movaps %xmm7,%xmm0	# horizontal reduction of the 8 lanes of xmm7
+	psrldq $8,%xmm0		# bring the upper 4 words down
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $32,%xmm0		# bring words 2-3 down onto words 0-1
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $16,%xmm0		# bring word 1 down onto word 0
+	pmaxsw %xmm0,%xmm7	# max (peak) value in low word of %xmm7
+	
+	movd %xmm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/libfec/peak_sse_assist.s b/libfec/peak_sse_assist.s
new file mode 100644
index 0000000..ea6fce8
--- /dev/null
+++ b/libfec/peak_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse(signed short *in,int cnt);
+	.global peakval_sse
+	.type peakval_sse,@function
+	.align 16
+peakval_sse:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	# stack arguments: 8(%ebp) = in, 12(%ebp) = cnt; the peak is returned in eax
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# claim the next group of 4 samples
+	jl 2f			# fewer than 4 left: stop (leftover samples are not examined here)
+	movq (%esi),%mm0	# load 4 x 16-bit samples
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	pmaxsw %mm0,%mm7	# store peak
+	
+	addl $8,%esi		# advance 4 samples (8 bytes)
+	jmp 1b	
+
+2:	movq %mm7,%mm0		# horizontal reduction of the 4 lanes of mm7
+	psrlq $32,%mm0		# bring words 2-3 down onto words 0-1
+	pmaxsw %mm0,%mm7
+	movq %mm7,%mm0
+	psrlq $16,%mm0		# bring word 1 down onto word 0
+	pmaxsw %mm0,%mm7	# max (peak) value in low word of %mm7
+	
+	movd %mm7,%eax
+	andl $0xffff,%eax	# return only the low 16-bit word
+
+	emms			# clear MMX state before returning
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/libfec/peaktest.c b/libfec/peaktest.c
new file mode 100644
index 0000000..fa4b280
--- /dev/null
+++ b/libfec/peaktest.c
@@ -0,0 +1,38 @@
+/* Verify correctness of the peak routine
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+
+int peakval(signed short *,int);
+int peakval_port(signed short *,int);
+
+int main(){
+  int i,s;
+  int result,rresult;
+  signed short samples[NSAMP];
+  /* Compare peakval() with the portable peakval_port() on the same random buffer. */
+  srandom(time(NULL));
+
+  for(i=0;i<NSAMP;i++){
+    do {
+      s = random() & 0x0fff;	/* 0..0x0fff, so the 0x8000 retry below never fires */
+    } while(s == 0x8000);
+    samples[i] = s;
+  }
+  samples[5] = 25000;		/* planted peak, larger than any random sample */
+
+  rresult = peakval_port(&samples[OFFSET],NSAMP-OFFSET);	/* reference result */
+  result = peakval(&samples[OFFSET],NSAMP-OFFSET);	/* routine under test */
+  if(result == rresult){
+    printf("OK\n");
+  } else {
+    printf("peak mismatch: %d != %d\n",result,rresult);
+  }
+  exit(result == rresult ? EXIT_SUCCESS : EXIT_FAILURE);	/* non-zero status lets "make test" notice a mismatch */
+}
diff --git a/libfec/peakval.c b/libfec/peakval.c
new file mode 100644
index 0000000..2105a44
--- /dev/null
+++ b/libfec/peakval.c
@@ -0,0 +1,50 @@
+/* Switch to appropriate version of peakval routine
+ * Copyright 2004, Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_port(signed short *b,int cnt);
+#ifdef __i386__
+int peakval_mmx(signed short *b,int cnt);
+int peakval_sse(signed short *b,int cnt);
+int peakval_sse2(signed short *b,int cnt);
+#endif
+
+#ifdef __x86_64__
+int peakval_sse2(signed short *b,int cnt);
+#endif
+
+#ifdef __VEC__
+int peakval_av(signed short *b,int cnt);
+#endif
+
+int peakval(signed short *b,int cnt){
+  /* Return the result of the peakval_* variant selected by Cpu_mode.
+   * find_cpu_mode() is called first, before Cpu_mode is read.
+   */
+  find_cpu_mode();
+
+#ifdef __i386__
+  /* 32-bit x86: one variant per SIMD level */
+  if(Cpu_mode == MMX)
+    return peakval_mmx(b,cnt);
+  if(Cpu_mode == SSE)
+    return peakval_sse(b,cnt);
+  if(Cpu_mode == SSE2)
+    return peakval_sse2(b,cnt);
+#endif
+
+#ifdef __VEC__
+  /* AltiVec variant, compiled in only when __VEC__ is defined */
+  if(Cpu_mode == ALTIVEC)
+    return peakval_av(b,cnt);
+#endif
+
+  /* Portable C fallback.  Reached for PORT, for any mode that has no
+   * variant compiled in above, and for SSE2 on x86_64 (that build
+   * declares peakval_sse2() but does not call it).
+   */
+  return peakval_port(b,cnt);
+}
diff --git a/libfec/peakval_av.c b/libfec/peakval_av.c
new file mode 100644
index 0000000..ae54c10
--- /dev/null
+++ b/libfec/peakval_av.c
@@ -0,0 +1,61 @@
+/* Return the largest absolute value of a vector of signed shorts
+
+ * This is the Altivec SIMD version.
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include "fec.h"
+
+int peakval_av(signed short *in,int cnt){
+  vector signed short x;
+  int pad;
+  union { vector signed char cv; vector signed short hv; signed short s[8]; signed char c[16];} s;
+  vector signed short smallest,largest;
+  /* Returns int to match the "int peakval_av(signed short *,int)" declaration used by the caller in peakval.c */
+  smallest = (vector signed short)(0);
+  largest = (vector signed short)(0);
+  if((pad = (int)in & 15)!=0){	/* pad = byte offset of in within its 16-byte block */
+    /* Load unaligned leading word */
+    x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+    if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+      s.c[15] = (8-cnt)<<4;
+      x = vec_sro(x,s.cv);
+    }
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    in += 8-pad/2;	/* skip the samples consumed from the leading block */
+    cnt -= 8-pad/2;	/* may go negative for short inputs; both checks below then skip */
+  }
+  /* Everything is now aligned, rip through most of the block */
+  while(cnt >= 8){
+    x = vec_ld(0,in);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+    in += 8;
+    cnt -= 8;
+  }
+  /* Handle trailing fragment, if any */
+  if(cnt > 0){
+    x = vec_ld(0,in);
+    s.c[15] = (8-cnt)<<4;
+    x = vec_sro(x,s.cv);
+    smallest = vec_min(smallest,x);
+    largest = vec_max(largest,x);
+  }
+  /* Combine and extract result */
+  largest = vec_max(largest,vec_abs(smallest));
+
+  s.c[15] = 64; /* Shift right four 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.c[15] = 32; /* Shift right two 16-bit words */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.c[15] = 16; /* Shift right one 16-bit word */
+  largest = vec_max(largest,vec_sro(largest,s.cv));
+
+  s.hv = largest;
+  return s.s[7];	/* element 7 holds the combined maximum after the three shifts */
+}
diff --git a/libfec/peakval_mmx.c b/libfec/peakval_mmx.c
new file mode 100644
index 0000000..436fe88
--- /dev/null
+++ b/libfec/peakval_mmx.c
@@ -0,0 +1,34 @@
+/* Wrapper for the MMX version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+
+int peakval_mmx_assist(signed short *,int);
+
+int peakval_mmx(signed short *b,int cnt){
+  int best = 0;   /* largest magnitude seen so far */
+  int mag;
+
+  /* Scalar lead-in: take one sample at a time until b sits on an
+   * 8-byte boundary or the input is exhausted */
+  for(; ((int)b & 7) != 0 && cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  /* The assembler helper scans whole groups of four samples */
+  mag = peakval_mmx_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Skip what the helper covered; at most three samples remain */
+  b += cnt & ~3;
+  cnt &= 3;
+
+  /* Scalar tail */
+  for(; cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  return best;
+}
diff --git a/libfec/peakval_mmx_assist.s b/libfec/peakval_mmx_assist.s
new file mode 100644
index 0000000..553cb79
--- /dev/null
+++ b/libfec/peakval_mmx_assist.s
@@ -0,0 +1,70 @@
+# MMX assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak value in signed 16-bit input samples
+#  int peakval_mmx_assist(signed short *in,int cnt);	
+	.global peakval_mmx_assist
+	.type peakval_mmx_assist,@function
+	.align 16
+peakval_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx		# saved and restored, though not otherwise used in this routine
+
+	movl 8(%ebp),%esi	# esi = in (first stack argument)
+	movl 12(%ebp),%ecx	# ecx = cnt (second stack argument)
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# each pass consumes four 16-bit samples
+	jl 2f			# fewer than four left: reduce the four running peaks
+	movq (%esi),%mm0
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	movq %mm7,%mm6		# copy previous peak
+	pcmpgtw %mm0,%mm6	# ff == old peak greater
+	pand %mm6,%mm7		# select old peaks that are greater
+	pandn %mm0,%mm6		# select new values where the old peak was not greater
+	por %mm6,%mm7		# merge the two selections into the new running peaks
+	
+	addl $8,%esi		# advance past the four samples (8 bytes)
+	jmp 1b	
+
+2:	movd %mm7,%eax		# eax = peak of word 0 (masked to 16 bits below)
+	psrlq $16,%mm7
+	andl $0xffff,%eax
+	
+	movd %mm7,%edx		# edx = peak of word 1
+	psrlq $16,%mm7
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl  3f			# keep eax when it is already >= edx
+	movl %edx,%eax
+3:		
+	movd %mm7,%edx		# edx = peak of word 2
+	psrlq $16,%mm7
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl 4f
+	movl %edx,%eax
+4:		
+	movd %mm7,%edx		# edx = peak of word 3
+	andl $0xffff,%edx
+	cmpl %edx,%eax
+	jnl 5f
+	movl %edx,%eax
+5:	
+	emms			# leave MMX state before returning to the C caller
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret			# result is in eax
+	
diff --git a/libfec/peakval_port.c b/libfec/peakval_port.c
new file mode 100644
index 0000000..07ab316
--- /dev/null
+++ b/libfec/peakval_port.c
@@ -0,0 +1,16 @@
+/* Portable C version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+int peakval_port(signed short *b,int len){
+  /* Largest absolute value among the first len entries of b.
+   * The result stays 0 when the loop never runs (len <= 0). */
+  int best = 0;
+  int k = 0;
+  while(k < len){
+    int mag = abs(b[k++]);
+    best = (mag > best) ? mag : best;
+  }
+  return best;
+}
diff --git a/libfec/peakval_sse.c b/libfec/peakval_sse.c
new file mode 100644
index 0000000..9868b7f
--- /dev/null
+++ b/libfec/peakval_sse.c
@@ -0,0 +1,35 @@
+/* IA-32 SSE version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_sse_assist(signed short *,int);
+
+int peakval_sse(signed short *b,int cnt){
+  int best = 0;   /* largest magnitude seen so far */
+  int mag;
+
+  /* Scalar lead-in: take one sample at a time until b sits on an
+   * 8-byte boundary or the input is exhausted */
+  for(; ((int)b & 7) != 0 && cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  /* The assembler helper scans whole groups of four samples */
+  mag = peakval_sse_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Skip what the helper covered; at most three samples remain */
+  b += cnt & ~3;
+  cnt &= 3;
+
+  /* Scalar tail */
+  for(; cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  return best;
+}
diff --git a/libfec/peakval_sse2.c b/libfec/peakval_sse2.c
new file mode 100644
index 0000000..79d9059
--- /dev/null
+++ b/libfec/peakval_sse2.c
@@ -0,0 +1,34 @@
+/* IA-32 SSE2 version of peakval
+ * Copyright 2004 Phil Karn, KA9Q
+ */
+#include <stdlib.h>
+#include "fec.h"
+
+int peakval_sse2_assist(signed short *,int);
+
+int peakval_sse2(signed short *b,int cnt){
+  int best = 0;   /* largest magnitude seen so far */
+  int mag;
+
+  /* Scalar lead-in: take one sample at a time until b sits on a
+   * 16-byte boundary or the input is exhausted */
+  for(; ((int)b & 15) != 0 && cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+
+  /* The assembler helper scans whole groups of eight samples */
+  mag = peakval_sse2_assist(b,cnt);
+  best = (mag > best) ? mag : best;
+
+  /* Skip what the helper covered; at most seven samples remain */
+  b += cnt & ~7;
+  cnt &= 7;
+
+  /* Scalar tail */
+  for(; cnt != 0; b++, cnt--){
+    mag = abs(*b);
+    best = (mag > best) ? mag : best;
+  }
+  return best;
+}
diff --git a/libfec/peakval_sse2_assist.s b/libfec/peakval_sse2_assist.s
new file mode 100644
index 0000000..c7a58e7
--- /dev/null
+++ b/libfec/peakval_sse2_assist.s
@@ -0,0 +1,51 @@
+# SSE2 assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse2_assist(signed short *in,int cnt);
+	.global peakval_sse2_assist
+	.type peakval_sse2_assist,@function
+	.align 16
+peakval_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in (first stack argument)
+	movl 12(%ebp),%ecx	# ecx = cnt (second stack argument)
+
+	pxor %xmm7,%xmm7	# clear peak
+	
+1:	subl $8,%ecx		# each pass consumes eight 16-bit samples
+	jl 2f			# fewer than eight left: reduce the running peaks
+	movaps (%esi),%xmm0	# aligned 16-byte load; the C wrapper aligns in to 16 bytes first
+	movaps %xmm0,%xmm1	
+	psraw $15,%xmm1		# xmm1 = 1's if negative, 0's if positive
+	pxor %xmm1,%xmm0	# complement negatives
+	psubw %xmm1,%xmm0	# add 1 to negatives
+	pmaxsw %xmm0,%xmm7	# store peak
+	
+	addl $16,%esi		# advance past the eight samples (16 bytes)
+	jmp 1b
+
+2:	movaps %xmm7,%xmm0	# horizontal reduction: fold upper half onto lower half
+	psrldq $8,%xmm0
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $32,%xmm0
+	pmaxsw %xmm0,%xmm7
+	movaps %xmm7,%xmm0
+	psrlq $16,%xmm0
+	pmaxsw %xmm0,%xmm7	# max value in low word of %xmm7
+	
+	movd %xmm7,%eax
+	andl $0xffff,%eax	# keep only the low 16-bit word as the return value
+
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/libfec/peakval_sse_assist.s b/libfec/peakval_sse_assist.s
new file mode 100644
index 0000000..827c800
--- /dev/null
+++ b/libfec/peakval_sse_assist.s
@@ -0,0 +1,49 @@
+# SSE assist routines for peakval
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Lesser General Public License (LGPL)
+
+	.text
+
+# Find peak absolute value in signed 16-bit input samples
+#  int peakval_sse_assist(signed short *in,int cnt);
+	.global peakval_sse_assist
+	.type peakval_sse_assist,@function
+	.align 16
+peakval_sse_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in (first stack argument)
+	movl 12(%ebp),%ecx	# ecx = cnt (second stack argument)
+
+	pxor %mm7,%mm7		# clear peak
+	
+1:	subl $4,%ecx		# each pass consumes four 16-bit samples
+	jl 2f			# fewer than four left: reduce the running peaks
+	movq (%esi),%mm0
+	movq %mm0,%mm1	
+	psraw $15,%mm1		# mm1 = 1's if negative, 0's if positive
+	pxor %mm1,%mm0		# complement negatives
+	psubw %mm1,%mm0		# add 1 to negatives
+	pmaxsw %mm0,%mm7	# store peak
+	
+	addl $8,%esi		# advance past the four samples (8 bytes)
+	jmp 1b	
+
+2:	movq %mm7,%mm0		# horizontal reduction: fold upper half onto lower half
+	psrlq $32,%mm0
+	pmaxsw %mm0,%mm7
+	movq %mm7,%mm0
+	psrlq $16,%mm0
+	pmaxsw %mm0,%mm7	# max value in low word of %mm7
+	
+	movd %mm7,%eax
+	andl $0xffff,%eax	# keep only the low 16-bit word as the return value
+
+	emms			# leave MMX state before returning to the C caller
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
diff --git a/libfec/rs-common.h b/libfec/rs-common.h
new file mode 100644
index 0000000..e64eb39
--- /dev/null
+++ b/libfec/rs-common.h
@@ -0,0 +1,26 @@
+/* Stuff common to all the general-purpose Reed-Solomon codecs
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+/* Reed-Solomon codec control block */
+struct rs { /* data_t is not defined in this header; the including file must define it first */
+  int mm;              /* Bits per symbol */
+  int nn;              /* Symbols per block (= (1<<mm)-1) */
+  data_t *alpha_to;     /* log lookup table -- NOTE(review): the name suggests index -> element (antilog); confirm against the init code */
+  data_t *index_of;     /* Antilog lookup table -- NOTE(review): the name suggests element -> index (log); confirm against the init code */
+  data_t *genpoly;      /* Generator polynomial */
+  int nroots;     /* Number of generator roots = number of parity symbols */
+  int fcr;        /* First consecutive root, index form */
+  int prim;       /* Primitive element, index form */
+  int iprim;      /* prim-th root of 1, index form */
+  int pad;        /* Padding bytes in shortened block (rs.3 describes pad as a count of leading symbols) */
+};
+
+static inline int modnn(struct rs *rs,int x){
+  /* Reduce x modulo rs->nn using only subtract, shift and mask.
+   * Each pass removes one nn, then folds the bits above mm back in. */
+  for (; rs->nn <= x; x = (x >> rs->mm) + (x & rs->nn))
+    x -= rs->nn;
+  return x;
+}
diff --git a/libfec/rs.3 b/libfec/rs.3
new file mode 100644
index 0000000..5d71503
--- /dev/null
+++ b/libfec/rs.3
@@ -0,0 +1,198 @@
+.TH REED-SOLOMON 3
+.SH NAME
+init_rs_int, encode_rs_int, decode_rs_int, free_rs_int,
+init_rs_char, encode_rs_char, decode_rs_char, free_rs_char,
+encode_rs_8, decode_rs_8, encode_rs_ccsds, decode_rs_ccsds
+\- Reed-Solomon encoding/decoding
+.SH SYNOPSIS
+.nf
+.ft B
+#include "fec.h"
+
+void *init_rs_int(int symsize,int gfpoly,int fcr,int prim,
+     int nroots,int pad);
+
+void encode_rs_int(void *rs,int *data,int *parity);
+
+int decode_rs_int(void *rs,int *data,int *eras_pos,int no_eras);
+
+void free_rs_int(void *rs);
+
+
+void *init_rs_char(int symsize,int gfpoly,int fcr,int prim,
+     int nroots,int pad);
+
+void encode_rs_char(void *rs,unsigned char *data,
+     unsigned char *parity);
+
+int decode_rs_char(void *rs,unsigned char *data,int *eras_pos,
+     int no_eras);
+
+void free_rs_char(void *rs);
+
+
+void encode_rs_8(unsigned char *data,unsigned char *parity,
+     int pad);
+
+int decode_rs_8(unsigned char *data,int *eras_pos,int no_eras,
+     int pad);
+
+
+void encode_rs_ccsds(unsigned char *data,unsigned char *parity,
+     int pad);
+
+int decode_rs_ccsds(unsigned char *data,int *eras_pos,int no_eras,
+     int pad);
+
+unsigned char Taltab[256];
+unsigned char Tal1tab[256];
+
+.fi
+
+.SH DESCRIPTION
+These functions implement Reed-Solomon error control encoding and
+decoding. For optimal performance in a variety of applications, three
+sets of functions are supplied. To access these functions, add "-lfec"
+to your linker command line.
+
+The functions with names ending in \fB_int\fR handle data in integer arrays,
+permitting arbitrarily large codewords limited only by machine
+resources.
+
+The functions with names ending in \fB_char\fR take unsigned char arrays and can
+handle codes with symbols of 8 bits or less (i.e., with codewords of
+255 symbols or less).
+
+\fBencode_rs_8\fR and \fBdecode_rs_8\fR implement a specific
+(255,223) code with 8-bit symbols specified by the CCSDS:
+a field generator of 1 + X + X^2 + X^7 + X^8 and a code
+generator with first consecutive root = 112 and a primitive element of
+11. These functions use the conventional
+polynomial form, \fInot\fR the dual-basis specified in
+the CCSDS standard, to represent symbols. This code may be
+shortened by giving a non-zero \fBpad\fR value to produce a
+(255-\fBpad\fR,223-\fBpad\fR) code. The padding will consist of the
+specified number of zeroes at the front of the full codeword.
+
+For full CCSDS compatibility, \fBencode_rs_ccsds\fR and
+\fBdecode_rs_ccsds\fR are provided. These functions use two lookup
+tables, \fBTaltab\fR to convert from conventional to dual-basis, and
+\fBTal1tab\fR to perform the inverse mapping from dual-basis to
+conventional form, before and after calls to \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
+
+The \fB_8\fR and \fB_ccsds\fR functions do not require initialization.
+
+To use the general purpose RS encoder or decoder (i.e.,
+the \fB_char\fR or \fB_int\fR versions), the user must first
+call \fBinit_rs_int\fR or \fBinit_rs_char\fR as appropriate. The
+arguments are as follows:
+
+\fBsymsize\fR gives the symbol size in bits, up to 8 for \fBinit_rs_char\fR
+or 32 for \fBinit_rs_int\fR on a machine with 32-bit ints (though such a
+huge code would exhaust memory limits on a 32-bit machine). The resulting
+Reed-Solomon code word will have 2^\fBsymsize\fR - 1 symbols,
+each containing \fBsymsize\fR bits. The codeword may be shortened with the
+\fBpad\fR parameter described below.
+
+\fBgfpoly\fR gives the extended Galois field generator polynomial coefficients,
+with the 0th coefficient in the low order bit. The polynomial
+\fImust\fR be primitive; if not, the call will fail and NULL will be
+returned.
+
+\fBfcr\fR gives, in index form, the first consecutive root of the
+Reed Solomon code generator polynomial.
+
+\fBprim\fR gives, in index form, the primitive element in the Galois field
+used to generate the Reed Solomon code generator polynomial.
+
+\fBnroots\fR gives the number of roots in the Reed Solomon code
+generator polynomial. This equals the number of parity symbols
+per code block.
+
+\fBpad\fR gives the number of leading symbols in the codeword
+that are implicitly padded to zero in a shortened code block. 
+
+The resulting Reed-Solomon code has parameters (N,K), where
+N = 2^\fBsymsize\fR - \fBpad\fR - 1 and K = N-\fBnroots\fR.
+
+The \fBencode_rs_char\fR and \fBencode_rs_int\fR functions accept
+the pointer returned by \fBinit_rs_char\fR or
+\fBinit_rs_int\fR, respectively, to
+encode a block of data using the specified code.
+The input data array is expected to
+contain K symbols (of \fBsymsize\fR bits each, right justified
+in each char or int) and \fBnroots\fR parity symbols will be placed
+into the \fBparity\fR array, right justified.
+
+The \fBdecode_\fR functions correct
+the errors in a Reed-Solomon codeword of N symbols up to the capability of the code.
+An optional list of "erased" symbol indices may be given in the \fBeras_pos\fR
+array to assist the decoder; this parameter may be NULL if no erasures
+are given. The number of erased symbols must be given in the \fBno_eras\fR
+parameter.
+
+To maximize performance, the encode and decode functions perform no
+"sanity checking" of their inputs. Decoder failure may result if
+\fBeras_pos\fR contains duplicate entries, and both encoder and
+decoder will fail if an input symbol exceeds its allowable range.
+(Symbol range overflow cannot occur with the \fB_8\fR or
+\fB_ccsds\fR functions,
+or with the \fB_char\fR functions when 8-bit symbols are specified.)
+
+The decoder corrects the symbols "in place", returning the number
+of symbols in error. If the codeword is uncorrectable, -1 is returned
+and the data block is unchanged. If \fBeras_pos\fR is non-null, it is
+used to return a list of corrected symbol positions, in no particular
+order.  This means that the
+array passed through this parameter \fImust\fR have at least \fBnroots\fR
+elements to prevent a possible buffer overflow.
+
+The \fBfree_rs_int\fR and \fBfree_rs_char\fR functions free the internal
+space allocated by the \fBinit_rs_int\fR and \fBinit_rs_char\fR functions,
+respectively.
+
+The functions \fBencode_rs_8\fR and \fBdecode_rs_8\fR do not have
+corresponding \fBinit\fR and \fBfree\fR, nor do they take the
+\fBrs\fR argument accepted by the other functions as their parameters
+are statically compiled. These functions implement a code
+equivalent to calling
+
+\fBinit_rs_char\fR(8,0x187,112,11,32,pad);
+
+and using the resulting pointer with \fBencode_rs_char\fR and
+\fBdecode_rs_char\fR.
+
+.SH RETURN VALUES
+\fBinit_rs_int\fR and \fBinit_rs_char\fR return a pointer to an internal
+control structure that must be passed to the corresponding encode, decode
+and free functions. These functions return NULL on error.
+
+The \fBdecode_\fR functions return a count of corrected
+symbols, or -1 if the block was uncorrectable.
+
+.SH AUTHOR
+Phil Karn, KA9Q (karn@ka9q.net), based heavily on earlier work by Robert
+Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari Thirumoorthy
+(harit@spectra.eng.hawaii.edu). Extra improvements suggested by Detmar
+Welz (dwelz@web.de).
+
+.SH COPYRIGHT
+Copyright 2004, Phil Karn, KA9Q. May be used under the terms of the
+GNU Lesser General Public License (LGPL).
+
+.SH SEE ALSO
+CCSDS 101.0-B-6: Telemetry Channel Coding.
+http://www.ccsds.org/documents/101x0b6.pdf
+
+.SH NOTE
+CCSDS chose the "dual basis" symbol representation because it
+simplified the implementation of a Reed-Solomon encoder in dedicated
+hardware. However, this approach holds no advantages for a software
+implementation on a general purpose computer, so use of the dual basis
+is recommended only if compatibility with the CCSDS standard is needed,
+e.g., to decode data from an existing spacecraft using the CCSDS
+standard. If you just want a fast (255,223) RS codec without needing
+to interoperate with a CCSDS standard code, use \fBencode_rs_8\fR
+and \fBdecode_rs_8\fR.
+
diff --git a/libfec/rs_speedtest.c b/libfec/rs_speedtest.c
new file mode 100644
index 0000000..225f160
--- /dev/null
+++ b/libfec/rs_speedtest.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "fec.h"
+
+int main(){ /* Times 10000 decodes of one (255,223) block with the general and the fixed CCSDS decoder */
+  unsigned char block[255]; /* 223 data bytes followed by 32 parity bytes */
+  int i,trials = 10000;
+  void *rs;
+  struct rusage start,finish;
+  double extime; /* user CPU seconds spent in the timed loop */
+
+  memset(block,0x01,223); /* constant payload; parity is appended by the encoders below */
+
+  rs = init_rs_char(8,0x187,112,11,32,0);
+  if(rs == NULL){ /* init_rs_char() returns NULL on error; never pass that to the codec */
+    fprintf(stderr,"init_rs_char failed!\n");
+    exit(1);
+  }
+  encode_rs_char(rs,block,&block[223]);
+
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_char(rs,block,NULL,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  printf("Execution time for %d Reed-Solomon blocks using general decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+  free_rs_char(rs); /* the general codec is not used past this point */
+
+  encode_rs_8(block,&block[223],0);
+  getrusage(RUSAGE_SELF,&start);
+  for(i=0;i<trials;i++){
+#if 0
+    block[0] ^= 0xff; /* Introduce an error */
+    block[2] ^= 0xff; /* Introduce an error */
+#endif
+    decode_rs_8(block,NULL,0,0);
+  }
+  getrusage(RUSAGE_SELF,&finish);
+  extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+  printf("Execution time for %d Reed-Solomon blocks using CCSDS decoder: %.2f sec\n",trials,extime);
+  printf("decoder speed: %g bits/s\n",trials*223*8/extime);
+  exit(0);
+}
+
diff --git a/libfec/rstest.c b/libfec/rstest.c
new file mode 100644
index 0000000..539b40a
--- /dev/null
+++ b/libfec/rstest.c
@@ -0,0 +1,296 @@
+/* Test the Reed-Solomon codecs
+ * for various block sizes and with random data and random error patterns
+ *
+ * Copyright 2002 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "fec.h"
+
+
+struct etab { /* one Reed-Solomon code to exercise; fields are passed straight to init_rs_char()/init_rs_int() */
+  int symsize; /* bits per symbol; main() picks the char codec when <= 8, else the int codec */
+  int genpoly; /* passed as the gfpoly argument */
+  int fcs;     /* passed as the fcr (first consecutive root) argument */
+  int prim;    /* passed as the prim argument */
+  int nroots;  /* passed as the nroots argument; tests inject up to nroots/2 errors */
+  int ntrials; /* not read anywhere in this file */
+} Tab[] = {
+  {2, 0x7,     1,   1, 1, 10 },
+  {3, 0xb,     1,   1, 2, 10 },
+  {4, 0x13,    1,   1, 4, 10 },
+  {5, 0x25,    1,   1, 6, 10 },
+  {6, 0x43,    1,   1, 8, 10 },
+  {7, 0x89,    1,   1, 10, 10 },
+  {8, 0x11d,   1,   1, 32, 10 },
+  {8, 0x187,   112,11, 32, 10 }, /* Duplicates CCSDS codec */
+  {9, 0x211,   1,   1, 32, 10 },
+  {10,0x409,   1,   1, 32, 10 },
+  {11,0x805,   1,   1, 32, 10 },
+  {12,0x1053,  1,   1, 32, 5 },
+  {13,0x201b,  1,   1, 32, 2 },
+  {14,0x4443,  1,   1, 32, 1 },
+  {15,0x8003,  1,   1, 32, 1 },
+  {16,0x1100b, 1,   1, 32, 1 },
+  {0, 0, 0, 0, 0}, /* terminator: main() stops at symsize == 0 */
+};
+
+int exercise_char(struct etab *e);
+int exercise_int(struct etab *e);
+int exercise_8(void);
+
+int main(){ /* Runs every codec exercise; exit status is nonzero if any of them reported a problem */
+  int i;
+  int failures = 0; /* number of exercise_*() calls that returned nonzero */
+
+  srandom(time(NULL));
+
+  printf("Testing fixed CCSDS encoder...\n");
+  if(exercise_8() != 0)
+    failures++;
+  for(i=0;Tab[i].symsize != 0;i++){
+    int nn = (1<<Tab[i].symsize) - 1;
+    int kk = nn - Tab[i].nroots;
+
+    printf("Testing (%d,%d) code...\n",nn,kk);
+    /* Symbols of up to 8 bits go through the char codec, wider ones through the int codec */
+    if((Tab[i].symsize <= 8 ? exercise_char(&Tab[i]) : exercise_int(&Tab[i])) != 0)
+      failures++;
+  }
+  exit(failures ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+
+int exercise_8(void){ /* Round-trips random blocks through encode_rs_8/decode_rs_8; returns the number of discrepancies found */
+  int nn = 255;
+  unsigned char block[nn],tblock[nn]; /* reference codeword / corrupted copy handed to the decoder */
+  int errlocs[nn],derrlocs[nn]; /* flags for injected error positions / positions passed to and reported by the decoder */
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+
+  /* Compute code parameters */
+  kk = 223;
+
+
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors<=(nn-kk)/2;errors++){
+
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    memcpy(tblock,block,sizeof(block)); /* overwritten again after encoding, before tblock is read */
+    encode_rs_8(block,&block[kk],0);
+
+    /* Make temp copy, seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+	errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+      
+      do {
+	errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+      
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE /* erasure hints are only generated when the build defines FLAG_ERASURE nonzero */
+      if(random() & 1) /* 50-50 chance */
+	derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_8(tblock,derrlocs,erasures,0);
+
+    if(derrors != errors){ /* reported count must match the number of injected errors */
+	printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+	decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){ /* every reported position must be one that was actually corrupted */
+      if(errlocs[derrlocs[i]] == 0){
+	printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+	decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){ /* decoded block must equal the original codeword */
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+	printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+  return decoder_errors;
+}
+
+
+int exercise_char(struct etab *e){
+  /* Round-trips random blocks through the char codec for table entry e; returns the discrepancy count, or -1 if the codec cannot be set up */
+  int nn = (1<<e->symsize) - 1;          /* symbols per codeword */
+  unsigned char block[nn],tblock[nn];    /* reference codeword / corrupted copy handed to the decoder */
+  int errlocs[nn],derrlocs[nn];          /* flags for injected error positions / positions passed to and reported by the decoder */
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+  void *rs;
+
+  if(e->symsize > 8)
+    return -1;
+
+  /* Compute code parameters */
+  kk = nn - e->nroots;
+
+  rs = init_rs_char(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+  if(rs == NULL){
+    printf("init_rs_char failed!\n");
+    return -1;
+  }
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors <= e->nroots/2;errors++){
+
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    encode_rs_char(rs,block,&block[kk]);
+
+    /* Make temp copy, seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+        errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+
+      do {
+        errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE /* erasure hints are only generated when the build defines FLAG_ERASURE nonzero */
+      if(random() & 1) /* 50-50 chance */
+        derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_char(rs,tblock,derrlocs,erasures);
+
+    if(derrors != errors){ /* reported count must match the number of injected errors */
+      printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+      decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){ /* every reported position must be one that was actually corrupted */
+      if(errlocs[derrlocs[i]] == 0){
+        printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+        decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){ /* decoded block must equal the original codeword */
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+        printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+
+  free_rs_char(rs);
+  return decoder_errors; /* report what was counted, matching exercise_8() */
+}
+
+int exercise_int(struct etab *e){
+  /* Round-trips random blocks through the int codec for table entry e; returns the discrepancy count, or -1 if the codec cannot be set up */
+  int nn = (1<<e->symsize) - 1;   /* symbols per codeword */
+  int block[nn],tblock[nn];       /* reference codeword / corrupted copy handed to the decoder */
+  int errlocs[nn],derrlocs[nn];   /* flags for injected error positions / positions passed to and reported by the decoder */
+  int i;
+  int errors;
+  int derrors,kk;
+  int errval,errloc;
+  int erasures;
+  int decoder_errors = 0;
+  void *rs;
+
+  /* Compute code parameters */
+  kk = nn - e->nroots;
+
+  rs = init_rs_int(e->symsize,e->genpoly,e->fcs,e->prim,e->nroots,0);
+  if(rs == NULL){
+    printf("init_rs_int failed!\n");
+    return -1;
+  }
+  /* Test up to the error correction capacity of the code */
+  for(errors=0;errors <= e->nroots/2;errors++){
+
+    /* Load block with random data and encode */
+    for(i=0;i<kk;i++)
+      block[i] = random() & nn;
+    encode_rs_int(rs,block,&block[kk]);
+
+    /* Make temp copy, seed with errors */
+    memcpy(tblock,block,sizeof(block));
+    memset(errlocs,0,sizeof(errlocs));
+    memset(derrlocs,0,sizeof(derrlocs));
+    erasures=0;
+    for(i=0;i<errors;i++){
+      do {
+        errval = random() & nn;
+      } while(errval == 0); /* Error value must be nonzero */
+
+      do {
+        errloc = random() % nn;
+      } while(errlocs[errloc] != 0); /* Must not choose the same location twice */
+
+      errlocs[errloc] = 1;
+
+#if FLAG_ERASURE /* erasure hints are only generated when the build defines FLAG_ERASURE nonzero */
+      if(random() & 1) /* 50-50 chance */
+        derrlocs[erasures++] = errloc;
+#endif
+      tblock[errloc] ^= errval;
+    }
+
+    /* Decode the errored block */
+    derrors = decode_rs_int(rs,tblock,derrlocs,erasures);
+
+    if(derrors != errors){ /* reported count must match the number of injected errors */
+      printf("(%d,%d) decoder says %d errors, true number is %d\n",nn,kk,derrors,errors);
+      decoder_errors++;
+    }
+    for(i=0;i<derrors;i++){ /* every reported position must be one that was actually corrupted */
+      if(errlocs[derrlocs[i]] == 0){
+        printf("(%d,%d) decoder indicates error in location %d without error\n",nn,kk,derrlocs[i]);
+        decoder_errors++;
+      }
+    }
+    if(memcmp(tblock,block,sizeof(tblock)) != 0){ /* decoded block must equal the original codeword */
+      printf("(%d,%d) decoder uncorrected errors! output ^ input:",nn,kk);
+      decoder_errors++;
+      for(i=0;i<nn;i++)
+        printf(" %02x",tblock[i] ^ block[i]);
+      printf("\n");
+    }
+  }
+
+  free_rs_int(rs);
+  return decoder_errors; /* report what was counted, matching exercise_8() */
+}
diff --git a/libfec/sim.c b/libfec/sim.c
new file mode 100644
index 0000000..151b04c
--- /dev/null
+++ b/libfec/sim.c
@@ -0,0 +1,43 @@
+#include <math.h>
+#include <stdlib.h>
+#include "fec.h"
+
+#define	MAX_RANDOM	0x7fffffff
+
+/* Generate gaussian random double with specified mean and std_dev */
+double normal_rand(double mean, double std_dev)
+{
+  static double saved;     /* second deviate kept from the previous draw */
+  static int have_saved;   /* nonzero while 'saved' is waiting to be handed out */
+  double u,v,r2,scale;
+
+  if(!have_saved){
+    /* Draw points uniformly from the square [-1,+1]x[-1,+1] until one lies inside the unit circle and is not the origin */
+    for(;;){
+      u = 2.0 * (double)random() / MAX_RANDOM - 1;
+      v = 2.0 * (double)random() / MAX_RANDOM - 1;
+      r2 = u*u + v*v;
+      if(!(r2 >= 1.0 || r2 == 0.0))
+        break;
+    }
+    scale = sqrt(-2.0*log(r2)/r2);
+    saved = u*scale;       /* keep one deviate for the next call */
+    have_saved = 1;
+    return mean + std_dev*v*scale;
+  }
+  /* A deviate is left over from the previous draw: use it up */
+  have_saved = 0;
+  return mean + std_dev*saved;
+}
+
+unsigned char addnoise(int sym,double amp,double gain,double offset,int clip){
+  /* Noisy sample for one symbol: +amp when sym is nonzero, else -amp, plus
+   * unit-variance Gaussian noise, scaled by gain and shifted by offset */
+  const double level = sym ? amp : -amp;
+  int noisy = offset + gain*normal_rand(level,1.0);
+  /* Clamp into [0,clip]; the lower bound is checked first */
+  if(noisy < 0)
+    return 0;
+  if(noisy > clip)
+    return clip;
+  return noisy;
+}
diff --git a/libfec/simd-viterbi.3 b/libfec/simd-viterbi.3
new file mode 100644
index 0000000..4c67593
--- /dev/null
+++ b/libfec/simd-viterbi.3
@@ -0,0 +1,247 @@
+.TH SIMD-VITERBI 3
+.SH NAME
+create_viterbi27, set_viterbi27_polynomial, init_viterbi27, update_viterbi27_blk,
+chainback_viterbi27, delete_viterbi27,
+create_viterbi29, set_viterbi29_polynomial, init_viterbi29, update_viterbi29_blk,
+chainback_viterbi29, delete_viterbi29,
+create_viterbi39, set_viterbi39_polynomial, init_viterbi39, update_viterbi39_blk,
+chainback_viterbi39, delete_viterbi39,
+create_viterbi615, set_viterbi615_polynomial, init_viterbi615, update_viterbi615_blk,
+chainback_viterbi615, delete_viterbi615 \- IA32 SIMD-assisted Viterbi decoders
+.SH SYNOPSIS
+.nf
+.ft B
+#include "fec.h"
+void *create_viterbi27(int blocklen);
+void set_viterbi27_polynomial(int polys[2]);
+int init_viterbi27(void *vp,int starting_state);
+int update_viterbi27_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi27(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi27(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi29(int blocklen);
+void set_viterbi29_polynomial(int polys[2]);
+int init_viterbi29(void *vp,int starting_state);
+int update_viterbi29_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi29(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi29(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi39(int blocklen);
+void set_viterbi39_polynomial(int polys[3]);
+int init_viterbi39(void *vp,int starting_state);
+int update_viterbi39_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi39(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi39(void *vp);
+.fi
+.sp
+.nf
+.ft B
+void *create_viterbi615(int blocklen);
+void set_viterbi615_polynomial(int polys[6]);
+int init_viterbi615(void *vp,int starting_state);
+int update_viterbi615_blk(void *vp,unsigned char syms[],int nbits);
+int chainback_viterbi615(void *vp, unsigned char *data,unsigned int nbits,unsigned int endstate);
+void delete_viterbi615(void *vp);
+.fi
+.SH DESCRIPTION
+These functions implement high performance Viterbi decoders for four
+convolutional codes: a rate 1/2 constraint length 7 (k=7) code
+("viterbi27"), a rate 1/2 k=9 code ("viterbi29"),
+a rate 1/3 k=9 code ("viterbi39") and a rate 1/6 k=15 code ("viterbi615").
+The decoders use the Intel IA32 or PowerPC SIMD instruction sets, if available, to improve
+decoding speed.
+
+On the IA32 there are three different SIMD instruction sets. The first
+and most common is MMX, introduced on later Intel Pentiums and then on
+the Intel Pentium II and most Intel clones (AMD K6, Transmeta Crusoe,
+etc).  SSE was introduced on the Pentium III and later implemented in
+the AMD Athlon 4 (AMD calls it "3D Now!  Professional"). Most
+recently, SSE2 was introduced in the Intel Pentium 4, and has been
+adopted by more recent AMD CPUs. The presence of SSE2 implies the
+existence of SSE, which in turn implies MMX.
+
+Altivec is the PowerPC SIMD instruction set. It is roughly comparable
+to SSE2. Altivec was introduced to the general public in the Apple
+Macintosh G4; it is also present in the G5. Altivec is actually a
+Motorola trademark; Apple calls it "Velocity Engine" and IBM calls it
+"VMX". All refer to the same thing.
+
+When built for the IA32 or PPC architectures, the functions
+automatically use the most powerful SIMD instruction set available. If
+no SIMD instructions are available, or if the library is built for a
+non-IA32, non-PPC machine, a portable C version is executed
+instead.
+
+.SH USAGE
+Four versions of each function are provided, one for each code.
+In the following discussion, change "viterbi" to "viterbi27", "viterbi29", "viterbi39"
+or "viterbi615" as desired. 
+
+Before Viterbi decoding can begin, an instance must first be created with
+\fBcreate_viterbi()\fR.  This function creates and returns a pointer to
+an internal control structure
+containing the path metrics and the branch
+decisions. \fBcreate_viterbi()\fR takes one argument that gives the
+length of the data block in bits. You \fImust not\fR attempt to
+decode a block longer than the length given to \fBcreate_viterbi()\fR.
+
+Before decoding a new frame,
+\fBinit_viterbi()\fR must be called to reset the decoder state.
+It accepts the instance pointer returned by
+\fBcreate_viterbi()\fR and the initial starting state of the
+convolutional encoder (usually 0). If the initial starting state is unknown or
+incorrect, the decoder will still function but the decoded data may be
+incorrect at the start of the block.
+
+Blocks of received symbols are processed with calls to
+\fBupdate_viterbi_blk()\fR.  The \fBnbits\fR parameter specifies the
+number of \fIdata bits\fR (not channel symbols) represented by the
+\fBsyms\fR buffer. (For rate 1/2 codes, the number of symbols in
+\fBsyms\fR is twice \fInbits\fR, and so on.)
+Each symbol is expected to range
+from 0 through 255, with 0 corresponding to a "strong 0" and 255
+corresponding to a "strong 1". The caller is responsible for
+determining the proper pairing of input symbols (commonly known as
+decoder symbol phasing).
+
+At the end of the block, the data is recovered with a call to
+\fBchainback_viterbi()\fR. The arguments are the pointer to the
+decoder instance, a pointer to a user-supplied buffer into which the
+decoded data is to be written, the number of data bits (not bytes)
+that are to be decoded, and the terminal state of the convolutional
+encoder at the end of the frame (usually 0). If the terminal state is
+incorrect or unknown, the decoded data bits at the end of the frame
+may be unreliable. The decoded data is written in big-endian order,
+i.e., the first bit in the frame is written into the high order bit of
+the first byte in the buffer. If the frame is not an integral number
+of bytes long, the low order bits of the last byte in the frame will
+be unused.
+
+Note that the decoders assume the use of a tail, i.e., the encoding
+and transmission of a sufficient number of padding bits beyond the end
+of the user data to force the convolutional encoder into the known
+terminal state given to \fBchainback_viterbi()\fR. The tail is
+always one bit less than the constraint length of the code, so the k=7
+code uses 6 tail bits (12 tail symbols), the k=9 codes use 8 tail bits
+(16 tail symbols at rate 1/2, 24 at rate 1/3) and the k=15 code uses
+14 tail bits (84 tail symbols).
+
+The tail bits are not included in the length arguments to
+\fBcreate_viterbi()\fR and \fBchainback_viterbi()\fR. For example, if
+the block contains 1000 user bits, then this would be the length
+parameter given to \fBcreate_viterbi27()\fR and
+\fBchainback_viterbi27()\fR, and \fBupdate_viterbi27_blk()\fR would be called
+with a total of 2012 symbols - the last 12 encoded symbols
+representing the tail bits.
+
+After the call to \fBchainback_viterbi()\fR, the decoder may be reset
+with a call to \fBinit_viterbi()\fR and another block can be decoded.
+Alternatively, \fBdelete_viterbi()\fR can be called to free all resources
+used by the Viterbi decoder.
+
+The \fBset_viterbi_polynomial()\fR function allows use of other than the default
+code generator polynomials. Although only one set of polynomials is generally
+used with each code, there are different conventions as to their order and
+symbol polarity, and this function simplifies their use.
+
+The default polynomials for the viterbi27 routines
+are those of the NASA-JPL convention \fIwithout\fR symbol inversion.
+The NASA-JPL convention normally inverts the first symbol.
+The CCSDS/NASA-GSFC convention swaps the two symbols and inverts the second.
+.sp
+To set the NASA-JPL convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[2] = { -V27POLYA,V27POLYB };
+set_viterbi27_polynomial(polys);
+.ft R
+.fi
+.sp
+and to set the CCSDS convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[2] = { V27POLYB,-V27POLYA };
+set_viterbi27_polynomial(polys);
+.ft R
+.fi
+.sp
+The default polynomials for the viterbi615 routines
+are those used by the Cassini spacecraft \fIwithout\fR
+symbol inversion. Mars Pathfinder (MPF) and STEREO
+swap the third and fourth polynomials.
+Both conventions invert the
+first, third and fifth symbols. Refer to fec.h for the polynomial constant definitions.
+.sp
+To set the Cassini convention with symbol inversion, do the following:
+
+.nf
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYC,V615POLYD,-V615POLYE,V615POLYF };
+set_viterbi615_polynomial(polys);
+.ft R
+.fi
+.sp
+and to set the MPF/STEREO convention with symbol inversion:
+.sp
+.nf
+.ft B
+int polys[6] = { -V615POLYA,V615POLYB,-V615POLYD,V615POLYC,-V615POLYE,V615POLYF };
+set_viterbi615_polynomial(polys);
+.ft R
+.fi
+
+For performance reasons, calling this function changes the code
+generator polynomials for \fIall\fR instances of the corresponding Viterbi decoder,
+including those already created.
+
+.SH ERROR PERFORMANCE
+These decoders have all been extensively tested and found to provide
+performance consistent with that expected for soft-decision Viterbi
+decoding with 8-bit symbols.
+
+Due to internal differences, the implementations
+vary slightly in error performance. In
+general, the portable C versions exhibit the best error performance
+because they use full-sized branch metrics, and the MMX versions
+exhibit the worst because they use 8-bit branch metrics with modulo
+comparisons. The SSE, SSE2 and Altivec implementations of the r=1/2 k=7 and
+r=1/2 k=9 codes use unsigned
+8-bit branch metrics, and are almost as good as the C versions.  The
+r=1/3 k=9 and r=1/6 k=15 codes are implemented with 16-bit path metrics in all SIMD
+versions.
+
+.SH DIRECT ACCESS TO SPECIFIC FUNCTION VERSIONS
+Calling the functions listed above automatically calls the appropriate
+version of the function depending on the CPU type and available SIMD
+instructions. A particular version can also be called directly by
+appending the appropriate suffix to the function name. The available
+suffixes are "_mmx", "_sse", "_sse2", "_av" and "_port", for the MMX,
+SSE, SSE2, Altivec and portable versions, respectively. For example,
+the SSE2 version of the update_viterbi27_blk() function can be invoked
+as update_viterbi27_blk_sse2().
+
+Naturally, the _av functions are only available on the PowerPC and the
+_mmx, _sse and _sse2 versions are only available on IA-32. Calling
+a SIMD-enabled function on a CPU that doesn't support the appropriate
+set of instructions will result in an illegal instruction exception.
+
+.SH RETURN VALUES
+\fBcreate_viterbi\fR returns a pointer to the structure containing
+the decoder state. 
+The other functions return -1 on error, 0 otherwise.
+
+.SH AUTHOR & COPYRIGHT
+Phil Karn, KA9Q (karn@ka9q.net)
+
+.SH LICENSE
+This software may be used under the terms of the GNU Lesser General Public License (LGPL).
+
+
diff --git a/libfec/sqtest.c b/libfec/sqtest.c
new file mode 100644
index 0000000..b2abb09
--- /dev/null
+++ b/libfec/sqtest.c
@@ -0,0 +1,42 @@
+/* Verify correctness of the sum-of-square routines */
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/* These values should trigger leading/trailing array fragment handling */
+#define NSAMP 200002
+#define OFFSET 1
+
+long long sumsq_wq(signed short *in,int cnt);
+long long sumsq_wq_ref(signed short *in,int cnt);
+
+int main(){	/* Check sumsq_wq() against the plain C reference on random samples */
+  int i;
+  long long result,rresult;
+  signed short samples[NSAMP];
+
+  srandom(time(NULL));
+
+  for(i=0;i<NSAMP;i++)
+    samples[i] = random() & 0xffff;	/* keep the low 16 bits of each draw */
+  /* Reference value first, then the routine under test, both starting OFFSET samples in */
+  rresult = sumsq_wq_ref(&samples[OFFSET],NSAMP-OFFSET);
+  result = sumsq_wq(&samples[OFFSET],NSAMP-OFFSET);
+  if(result != rresult){
+    printf("sum mismatch: %lld != %lld\n",result,rresult);
+    exit(1);	/* nonzero status so a calling script can detect the failure */
+  }
+  printf("OK\n");
+  exit(0);
+}
+
+long long sumsq_wq_ref(signed short *in,int cnt){
+  /* Plain C reference: add up the square of each of the first cnt samples */
+  long long total = 0;
+  while(cnt-- > 0){
+    total += (long)*in * *in;
+    in++;
+  }
+  return total;
+}
+
diff --git a/libfec/sse2bfly27-64.s b/libfec/sse2bfly27-64.s
new file mode 100644
index 0000000..b23c6a9
--- /dev/null
+++ b/libfec/sse2bfly27-64.s
@@ -0,0 +1,210 @@
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2003 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   Modifications for x86_64, 2012 Matthias P. Braendli, HB9EGM:
+   - changed registers to x86-64 equivalents
+   - changed instructions accordingly
+   - %rip indirect addressing needed for position independent code,
+     which is required because x86-64 needs dynamic libs to be PIC
+
+   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ; 
+*/
+	# SSE2 (128-bit integer SIMD) version
+    # All X86-64 CPUs include SSE2
+
+	# These are offsets into struct v27, defined in viterbi27_av.c
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text	
+	.global update_viterbi27_blk_sse2,Branchtab27_sse2
+	.type update_viterbi27_blk_sse2,@function
+	.align 16
+	
+update_viterbi27_blk_sse2:
+	pushq %rbp
+	movq %rsp,%rbp
+    /* NOTE(review): still the i386 stack-argument layout. After this prologue 8(%rbp) holds the return */
+    /* address; the System V x86-64 ABI passes vp/syms/nbits in %rdi/%rsi/%edx. Port before relying on it. */
+	pushq %rsi
+	pushq %rdi
+	pushq %rdx
+	pushq %rbx
+	
+	movq 8(%rbp),%rdx	# rdx = vp (see NOTE above)
+	testq %rdx,%rdx
+	jnz  0f
+	movq $-1,%rax		# NULL vp: return -1 (immediate; a bare -1 would load from address -1)
+	jmp  err		
+0:	movq OLDMETRICS(%rdx),%rsi	# rsi -> old metrics
+	movq NEWMETRICS(%rdx),%rdi	# rdi -> new metrics
+	movq DP(%rdx),%rdx	# rdx -> decisions
+
+1:	movq 16(%rbp),%rax	# eax = nbits
+	decq %rax
+	jl   2f			# passed zero, we're done
+	movq %rax,16(%rbp)
+
+	xorq %rax,%rax
+	movq 12(%rbp),%rbx	# ebx = syms
+	movb (%rbx),%al
+	movd %rax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%rbx),%al
+	movd %rax,%xmm5		# xmm5[0] = second symbol
+	addq $2,%rbx
+	movq %rbx,12(%rbp)
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones(%rip),%xmm7
+	
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics: table entries XORed with the two received symbols (xmm6, xmm5)
+	movdqa (Branchtab27_sse2+(16*\GROUP))(%rip),%xmm4
+	movdqa (Branchtab27_sse2+32+(16*\GROUP))(%rip),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	
+	# compute 5-bit branch metric in xmm4 by averaging the two symbol metrics
+	# and dropping the low 3 bits. This is okay for this
+	# code because the worst-case metric spread (at high Eb/No) is only 120,
+	# well within the range of our unsigned 8-bit path metrics, and even within
+	# the range of signed 8-bit path metrics
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4	# xmm7 = thirtyones: clear bits the word shift carried across bytes
+
+	movdqa (16*\GROUP)(%rsi),%xmm0	# Incoming path metric, high bit = 0 (full 64-bit pointer)
+	movdqa ((16*\GROUP)+32)(%rsi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
+	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?
+	
+	# complement branch metrics (31 - metric)
+	pxor %xmm7,%xmm4
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2
+	
+	# Find survivors, leave in xmm0,xmm2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,xmm3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+	
+	# interleave and store new path metrics
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%rdi)
+	movdqa %xmm4,(32*\GROUP)(%rdi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shlq $16,%rbx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orq %rax,%rbx
+	movl %ebx,(4*\GROUP)(%rdx)	# 32 decision bits per group; an 8-byte store overruns the slot
+	.endm
+
+	# invoke macro 2 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+
+	addq $8,%rdx		# bump decision pointer
+		
+	# See if we have to normalize. This requires an explanation. We don't want
+	# our path metrics to exceed 255 on the *next* iteration. Since the
+	# largest branch metric is 30, that means we don't want any to exceed 225
+	# on *this* iteration. Rather than look them all, we just pick an arbitrary one
+	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
+	# determined worst-case metric spread for this code and branch metrics in the range 0-30.
+	
+	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
+	# show that a higher threshold could be used without affecting BER performance
+	movq (%rdi),%rax	# extract first output metric
+	andq $255,%rax
+	cmp $105,%rax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics. We can't just pick an arbitrary small constant because
+	# the minimum metric might be zero!
+	movdqa (%rdi),%xmm0
+	movdqa %xmm0,%xmm4	
+	movdqa 16(%rdi),%xmm1
+	pminub %xmm1,%xmm4
+	movdqa 32(%rdi),%xmm2
+	pminub %xmm2,%xmm4	
+	movdqa 48(%rdi),%xmm3	
+	pminub %xmm3,%xmm4
+
+	# crunch down to single lowest metric
+	movdqa %xmm4,%xmm5
+	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $32,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $16,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $8,%xmm5
+	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4
+
+	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
+	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
+	punpcklqdq %xmm4,%xmm4	# all 16 bytes
+	
+	# xmm4 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	psubusb %xmm4,%xmm0
+	psubusb %xmm4,%xmm1
+	psubusb %xmm4,%xmm2
+	psubusb %xmm4,%xmm3	
+	movdqa %xmm0,(%rdi)
+	movdqa %xmm1,16(%rdi)	
+	movdqa %xmm2,32(%rdi)	
+	movdqa %xmm3,48(%rdi)	
+	
+done:		
+	# swap metrics
+	movq %rsi,%rax
+	movq %rdi,%rsi
+	movq %rax,%rdi
+	jmp 1b
+	
+2:	movq 8(%rbp),%rbx	# ebx = vp
+	# stash metric pointers
+	movq %rsi,OLDMETRICS(%rbx)
+	movq %rdi,NEWMETRICS(%rbx)
+	movq %rdx,DP(%rbx)	# stash incremented value of vp->dp
+	xorq %rax,%rax
+err:	popq %rbx
+	popq %rdx
+	popq %rdi
+	popq %rsi
+	popq %rbp
+	ret
+
+	.data
+	.align 16
+
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/libfec/sse2bfly27.s b/libfec/sse2bfly27.s
new file mode 100644
index 0000000..27422a2
--- /dev/null
+++ b/libfec/sse2bfly27.s
@@ -0,0 +1,202 @@
+/* Intel SIMD (SSE2) implementations of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2003 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   void update_viterbi27_blk_sse2(struct v27 *vp,unsigned char syms[],int nbits) ; 
+*/
+	# SSE2 (128-bit integer SIMD) version
+	# Requires Pentium 4 or better
+
+	# These are offsets into struct v27, defined in viterbi27.h
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+	.text	
+	.global update_viterbi27_blk_sse2,Branchtab27_sse2
+	.type update_viterbi27_blk_sse2,@function
+	.align 16
+	
+update_viterbi27_blk_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+	
+	movl 8(%ebp),%edx	# edx = vp (first stack argument)
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 (immediate; a bare -1 would load from address -1)
+	jmp  err		
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%xmm5		# xmm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones,%xmm7
+	
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics: table entries XORed with the two received symbols (xmm6, xmm5)
+	movdqa Branchtab27_sse2+(16*\GROUP),%xmm4
+	movdqa Branchtab27_sse2+32+(16*\GROUP),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	
+	# compute 5-bit branch metric in xmm4 by averaging the two symbol metrics
+	# and dropping the low 3 bits. This is okay for this
+	# code because the worst-case metric spread (at high Eb/No) is only 120,
+	# well within the range of our unsigned 8-bit path metrics, and even within
+	# the range of signed 8-bit path metrics
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4
+
+	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
+	movdqa ((16*\GROUP)+32)(%esi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0	# note use of saturating arithmetic
+	paddusb %xmm4,%xmm3	# this shouldn't be necessary, but why not?
+	
+	# complement branch metrics (31 - metric)
+	pxor %xmm7,%xmm4
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2	
+	
+	# Find survivors, leave in xmm0,xmm2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,xmm3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+	
+	# interleave and store new path metrics
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%edi)
+	movdqa %xmm4,(32*\GROUP)(%edi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shll $16,%ebx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orl %eax,%ebx
+	movl %ebx,(4*\GROUP)(%edx)
+	.endm
+
+	# invoke macro 2 times for a total of 32 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+
+	addl $8,%edx		# bump decision pointer
+		
+	# See if we have to normalize. This requires an explanation. We don't want
+	# our path metrics to exceed 255 on the *next* iteration. Since the
+	# largest branch metric is 30, that means we don't want any to exceed 225
+	# on *this* iteration. Rather than look them all, we just pick an arbitrary one
+	# (the first) and see if it exceeds 225-120=105, where 120 is the experimentally-
+	# determined worst-case metric spread for this code and branch metrics in the range 0-30.
+	
+	# This is extremely conservative, and empirical testing at a variety of Eb/Nos might
+	# show that a higher threshold could be used without affecting BER performance
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $105,%eax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics. We can't just pick an arbitrary small constant because
+	# the minimum metric might be zero!
+	movdqa (%edi),%xmm0
+	movdqa %xmm0,%xmm4	
+	movdqa 16(%edi),%xmm1
+	pminub %xmm1,%xmm4
+	movdqa 32(%edi),%xmm2
+	pminub %xmm2,%xmm4	
+	movdqa 48(%edi),%xmm3	
+	pminub %xmm3,%xmm4
+
+	# crunch down to single lowest metric
+	movdqa %xmm4,%xmm5
+	psrldq $8,%xmm5     # the count to psrldq is bytes, not bits!
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $32,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $16,%xmm5
+	pminub %xmm5,%xmm4
+	movdqa %xmm4,%xmm5
+	psrlq $8,%xmm5
+	pminub %xmm5,%xmm4	# now in lowest byte of %xmm4
+
+	punpcklbw %xmm4,%xmm4	# lowest 2 bytes
+	pshuflw $0,%xmm4,%xmm4  # lowest 8 bytes
+	punpcklqdq %xmm4,%xmm4	# all 16 bytes
+	
+	# xmm4 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	psubusb %xmm4,%xmm0
+	psubusb %xmm4,%xmm1
+	psubusb %xmm4,%xmm2
+	psubusb %xmm4,%xmm3	
+	movdqa %xmm0,(%edi)
+	movdqa %xmm1,16(%edi)	
+	movdqa %xmm2,32(%edi)	
+	movdqa %xmm3,48(%edi)	
+	
+done:		
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+	
+2:	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
diff --git a/libfec/sse2bfly29-64.s b/libfec/sse2bfly29-64.s
new file mode 100644
index 0000000..22bd8a1
--- /dev/null
+++ b/libfec/sse2bfly29-64.s
@@ -0,0 +1,254 @@
+/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   Modifications for x86_64, 2012 Matthias P. Braendli, HB9EGM
+   - changed registers to x86-64 equivalents
+   - changed instructions accordingly
+   - %rip indirect addressing needed for position independent code,
+     which is required because x86-64 needs dynamic libs to be PIC.
+     That still doesn't work
+
+   void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; 
+*/
+	# SSE2 (128-bit integer SIMD) version
+    # All X86-64 CPUs include SSE2
+
+	# These are offsets into struct v29, defined in viterbi29_av.c
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+
+	.text	
+	.global update_viterbi29_blk_sse2,Branchtab29_sse2
+	.type update_viterbi29_blk_sse2,@function
+	.align 16
+	
+update_viterbi29_blk_sse2:
+	pushq %rbp
+	movq %rsp,%rbp
+    /* NOTE(review): still the i386 stack-argument layout. After this prologue 8(%rbp) holds the return */
+    /* address; the System V x86-64 ABI passes vp/syms/nbits in %rdi/%rsi/%edx. Port before relying on it. */
+	pushq %rsi
+	pushq %rdi
+	pushq %rdx
+	pushq %rbx
+	
+	movq 8(%rbp),%rdx	# rdx = vp (see NOTE above)
+	testq %rdx,%rdx
+	jnz  0f
+	movq $-1,%rax		# NULL vp: return -1 (immediate; a bare -1 would load from address -1)
+	jmp  err		
+0:	movq OLDMETRICS(%rdx),%rsi	# rsi -> old metrics
+	movq NEWMETRICS(%rdx),%rdi	# rdi -> new metrics
+	movq DP(%rdx),%rdx	# rdx -> decisions
+
+1:	movq 16(%rbp),%rax	# eax = nbits
+	decq %rax
+	jl   2f			# passed zero, we're done
+	movq %rax,16(%rbp)
+
+	xorq %rax,%rax
+	movq 12(%rbp),%rbx	# ebx = syms
+	movb (%rbx),%al
+	movd %rax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%rbx),%al
+	movd %rax,%xmm5		# xmm5[0] = second symbol
+	addq $2,%rbx
+	movq %rbx,12(%rbp)
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	movdqa thirtyones(%rip),%xmm7
+	pshuflw $0,%xmm6,%xmm6	# copy low word to low 3
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	movdqa thirtyones(%rip),%xmm7
+	
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics: table entries XORed with the two received symbols (xmm6, xmm5)
+	movdqa Branchtab29_sse2+(16*\GROUP)(%rip),%xmm4
+	movdqa Branchtab29_sse2+128+(16*\GROUP)(%rip),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4	# xmm4 contains branch metrics
+	
+	movdqa (16*\GROUP)(%rsi),%xmm0	# Incoming path metric, high bit = 0 (full 64-bit pointer)
+	movdqa ((16*\GROUP)+128)(%rsi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0
+	paddusb %xmm4,%xmm3
+	
+	# invert branch metrics (31 - metric)
+	pxor %xmm7,%xmm4
+	
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2
+	
+	# Find survivors, leave in xmm0,xmm2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,xmm3
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+	
+	# interleave and store new path metrics
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%rdi)
+	movdqa %xmm4,(32*\GROUP)(%rdi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shlq $16,%rbx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orq %rax,%rbx
+	movl %ebx,(4*\GROUP)(%rdx)	# 32 decision bits per group; an 8-byte store overruns the slot
+	.endm
+
+	# invoke macro 8 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+
+	addq $32,%rdx		# bump decision pointer
+		
+	# see if we have to normalize
+	movq (%rdi),%rax	# extract first output metric
+	andq $255,%rax
+	cmp $50,%rax		# is it greater than 50?
+	movq $0,%rax
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movdqa (%rdi),%xmm0
+	pminub 16(%rdi),%xmm0
+	pminub 32(%rdi),%xmm0
+	pminub 48(%rdi),%xmm0
+	pminub 64(%rdi),%xmm0
+	pminub 80(%rdi),%xmm0
+	pminub 96(%rdi),%xmm0	
+	pminub 112(%rdi),%xmm0	
+	pminub 128(%rdi),%xmm0
+	pminub 144(%rdi),%xmm0
+	pminub 160(%rdi),%xmm0
+	pminub 176(%rdi),%xmm0
+	pminub 192(%rdi),%xmm0
+	pminub 208(%rdi),%xmm0
+	pminub 224(%rdi),%xmm0
+	pminub 240(%rdi),%xmm0							
+
+	# crunch down to single lowest metric
+	movdqa %xmm0,%xmm1
+	psrldq $8,%xmm0     # the count to psrldq is bytes, not bits!
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $32,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $16,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $8,%xmm0
+	pminub %xmm1,%xmm0
+
+	punpcklbw %xmm0,%xmm0	# lowest 2 bytes
+	pshuflw $0,%xmm0,%xmm0  # lowest 8 bytes
+	punpcklqdq %xmm0,%xmm0	# all 16 bytes
+
+	# xmm0 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	movdqa (%rdi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,(%rdi)
+	movdqa 16(%rdi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,16(%rdi)	
+	movdqa 32(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,32(%rdi)	
+	movdqa 48(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,48(%rdi)	
+	movdqa 64(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,64(%rdi)	
+	movdqa 80(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,80(%rdi)	
+	movdqa 96(%rdi),%xmm1	
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,96(%rdi)	
+	movdqa 112(%rdi),%xmm1	
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,112(%rdi)	
+	movdqa 128(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,128(%rdi)	
+	movdqa 144(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,144(%rdi)	
+	movdqa 160(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,160(%rdi)	
+	movdqa 176(%rdi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,176(%rdi)	
+	movdqa 192(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,192(%rdi)	
+	movdqa 208(%rdi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,208(%rdi)	
+	movdqa 224(%rdi),%xmm1
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,224(%rdi)	
+	movdqa 240(%rdi),%xmm1							
+	psubusb %xmm0,%xmm1	
+	movdqa %xmm1,240(%rdi)	
+	
+done:		
+	# swap metrics
+	movq %rsi,%rax
+	movq %rdi,%rsi
+	movq %rax,%rdi
+	jmp 1b
+	
+2:	movq 8(%rbp),%rbx	# ebx = vp
+	# stash metric pointers
+	movq %rsi,OLDMETRICS(%rbx)
+	movq %rdi,NEWMETRICS(%rbx)
+	movq %rdx,DP(%rbx)	# stash incremented value of vp->dp
+	xorq %rax,%rax
+err:	popq %rbx
+	popq %rdx
+	popq %rdi
+	popq %rsi
+	popq %rbp
+	ret
+	
+	.data
+	.align 16
+thirtyones:	
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
diff --git a/libfec/sse2bfly29.s b/libfec/sse2bfly29.s
new file mode 100644
index 0000000..0fa1742
--- /dev/null
+++ b/libfec/sse2bfly29.s
@@ -0,0 +1,245 @@
+/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ; 
+*/
+
+	# SSE2 (128-bit integer SIMD) version; requires Pentium 4 or better
+	# i386 stack args: 8(%ebp) = vp, 12(%ebp) = syms (two read per bit), 16(%ebp) = nbits
+	# Returns 0 in %eax, or -1 when vp is NULL. Offsets into struct v29 (viterbi29.h):
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+
+	.text
+	.global update_viterbi29_blk_sse2,Branchtab29_sse2
+	.type update_viterbi29_blk_sse2,@function
+	.align 16
+
+update_viterbi29_blk_sse2:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 (needs '$'; a bare -1 is a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)	# write the decremented count back to the stack slot
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%xmm6		# xmm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%xmm5		# xmm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# syms advanced by the two symbols just read
+
+	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
+	punpcklbw %xmm5,%xmm5
+	movdqa thirtyones,%xmm7	# xmm7 = 31 in every byte (5-bit mask)
+	pshuflw $0,%xmm6,%xmm6	# copy low word to all 4 low words
+	pshuflw $0,%xmm5,%xmm5
+	punpcklqdq %xmm6,%xmm6  # propagate to all 16
+	punpcklqdq %xmm5,%xmm5
+	# xmm6 now contains first symbol in each byte, xmm5 the second
+
+	# xmm7 still holds the mask loaded above; nothing below writes it
+
+	# each invocation of this macro does 16 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movdqa Branchtab29_sse2+(16*\GROUP),%xmm4
+	movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3
+	pxor %xmm6,%xmm4
+	pxor %xmm5,%xmm3
+	pavgb %xmm3,%xmm4
+	psrlw $3,%xmm4
+
+	pand %xmm7,%xmm4	# xmm4 contains branch metrics
+
+	movdqa (16*\GROUP)(%esi),%xmm0	# Incoming path metric, high bit = 0
+	movdqa ((16*\GROUP)+128)(%esi),%xmm3	# Incoming path metric, high bit = 1
+	movdqa %xmm0,%xmm2
+	movdqa %xmm3,%xmm1
+	paddusb %xmm4,%xmm0
+	paddusb %xmm4,%xmm3
+
+	# invert branch metrics (XOR with 31 works because they are masked to 5 bits)
+	pxor %xmm7,%xmm4
+
+	paddusb %xmm4,%xmm1
+	paddusb %xmm4,%xmm2
+
+	# Find survivors, leave in xmm0,2
+	pminub %xmm1,%xmm0
+	pminub %xmm3,%xmm2
+	# get decisions, leave in xmm1,3 (0xff where the high-bit=1 candidate equals the survivor)
+	pcmpeqb %xmm0,%xmm1
+	pcmpeqb %xmm2,%xmm3
+
+	# interleave the surviving metrics from xmm0,2 and store them as new metrics
+	movdqa %xmm0,%xmm4
+	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
+	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
+	movdqa %xmm0,(32*\GROUP+16)(%edi)
+	movdqa %xmm4,(32*\GROUP)(%edi)
+
+	# interleave decisions & store
+	movdqa %xmm1,%xmm4
+	punpckhbw %xmm3,%xmm1
+	punpcklbw %xmm3,%xmm4
+	# work around bug in gas due to Intel doc error
+	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
+	shll $16,%ebx
+	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
+	orl %eax,%ebx
+	movl %ebx,(4*\GROUP)(%edx)	# 32 decision bits for this group
+	.endm
+
+	# invoke macro 8 times for a total of 128 butterflies
+	butterfly GROUP=0
+	butterfly GROUP=1
+	butterfly GROUP=2
+	butterfly GROUP=3
+	butterfly GROUP=4
+	butterfly GROUP=5
+	butterfly GROUP=6
+	butterfly GROUP=7
+
+	addl $32,%edx		# bump decision pointer
+
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $50,%eax		# is it greater than 50?
+	movl $0,%eax		# mov leaves the flags from cmp intact
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movdqa (%edi),%xmm0
+	pminub 16(%edi),%xmm0
+	pminub 32(%edi),%xmm0
+	pminub 48(%edi),%xmm0
+	pminub 64(%edi),%xmm0
+	pminub 80(%edi),%xmm0
+	pminub 96(%edi),%xmm0
+	pminub 112(%edi),%xmm0
+	pminub 128(%edi),%xmm0
+	pminub 144(%edi),%xmm0
+	pminub 160(%edi),%xmm0
+	pminub 176(%edi),%xmm0
+	pminub 192(%edi),%xmm0
+	pminub 208(%edi),%xmm0
+	pminub 224(%edi),%xmm0
+	pminub 240(%edi),%xmm0
+
+	# crunch down to single lowest metric
+	movdqa %xmm0,%xmm1
+	psrldq $8,%xmm0     # the count to psrldq is bytes, not bits!
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $32,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $16,%xmm0
+	pminub %xmm1,%xmm0
+	movdqa %xmm0,%xmm1
+	psrlq $8,%xmm0
+	pminub %xmm1,%xmm0
+
+	punpcklbw %xmm0,%xmm0	# lowest 2 bytes
+	pshuflw $0,%xmm0,%xmm0  # lowest 8 bytes
+	punpcklqdq %xmm0,%xmm0	# all 16 bytes
+
+	# xmm0 now contains lowest metric in all 16 bytes
+	# subtract it from every output metric
+	movdqa (%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,(%edi)
+	movdqa 16(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,16(%edi)
+	movdqa 32(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,32(%edi)
+	movdqa 48(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,48(%edi)
+	movdqa 64(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,64(%edi)
+	movdqa 80(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,80(%edi)
+	movdqa 96(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,96(%edi)
+	movdqa 112(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,112(%edi)
+	movdqa 128(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,128(%edi)
+	movdqa 144(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,144(%edi)
+	movdqa 160(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,160(%edi)
+	movdqa 176(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,176(%edi)
+	movdqa 192(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,192(%edi)
+	movdqa 208(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,208(%edi)
+	movdqa 224(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,224(%edi)
+	movdqa 240(%edi),%xmm1
+	psubusb %xmm0,%xmm1
+	movdqa %xmm1,240(%edi)
+
+done:
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
diff --git a/libfec/ssebfly27.s b/libfec/ssebfly27.s
new file mode 100644
index 0000000..7f445da
--- /dev/null
+++ b/libfec/ssebfly27.s
@@ -0,0 +1,205 @@
+/* Intel SIMD (SSE) implementation of Viterbi ACS butterflies
+   for 64-state (k=7) convolutional code
+   Copyright 2001 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   int update_viterbi27_blk_sse(struct v27 *vp,unsigned char syms[],int nbits) ; 
+*/
+
+	# SSE (64-bit integer SIMD) version
+	# Requires Pentium III or better
+	# i386 stack args: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; %eax = 0, or -1 if vp is NULL
+	# These are offsets into struct v27, defined in viterbi27.h
+	.set DP,128
+	.set OLDMETRICS,132
+	.set NEWMETRICS,136
+.text
+.global update_viterbi27_blk_sse,Branchtab27_sse
+	.type update_viterbi27_blk_sse,@function
+	.align 16
+
+update_viterbi27_blk_sse:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 (needs '$'; a bare -1 is a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)	# write the decremented count back to the stack slot
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# %ebx = syms
+	movb (%ebx),%al
+	movd %eax,%mm6		# mm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%mm5		# mm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# syms advanced by the two symbols just read
+
+	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
+	punpcklbw %mm5,%mm5
+	movq thirtyones,%mm7	# mm7 = 31 in every byte; reloaded each bit (PSUBUSBM trashes it)
+
+	pshufw $0,%mm6,%mm6	# copy low word to upper 3
+	pshufw $0,%mm5,%mm5
+	# mm6 now contains first symbol in each byte, mm5 the second
+
+	# each invocation of this macro does 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movq Branchtab27_sse+(8*\GROUP),%mm4
+	movq Branchtab27_sse+32+(8*\GROUP),%mm3
+	pxor %mm6,%mm4
+	pxor %mm5,%mm3
+	pavgb %mm3,%mm4			# mm4 contains branch metrics
+	psrlw $3,%mm4
+	pand %mm7,%mm4			# keep 5 bits per byte
+
+	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+32)(%esi),%mm3	# Incoming path metric, high bit = 1
+	movq %mm0,%mm2
+	movq %mm3,%mm1
+	paddusb %mm4,%mm0
+	paddusb %mm4,%mm3
+
+	# invert branch metrics. This works only because they're 5 bits
+	pxor %mm7,%mm4
+
+	paddusb %mm4,%mm1
+	paddusb %mm4,%mm2
+
+	# Find survivors, leave in mm0,2
+	pminub %mm1,%mm0
+	pminub %mm3,%mm2
+	# get decisions, leave in mm1,3 (0xff where the high-bit=1 candidate equals the survivor)
+	pcmpeqb %mm0,%mm1
+	pcmpeqb %mm2,%mm3
+
+	# interleave the surviving metrics from mm0,2 and store them as new metrics
+	movq %mm0,%mm4
+	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
+	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
+	movq %mm0,(16*\GROUP+8)(%edi)
+	movq %mm4,(16*\GROUP)(%edi)
+
+	# interleave decisions, accumulate into %ebx
+	movq %mm1,%mm4
+	punpckhbw %mm3,%mm1
+	punpcklbw %mm3,%mm4
+	# Due to an error in the Intel instruction set ref (the register
+	# fields are swapped), gas assembles pmovmskb incorrectly
+	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
+	shll $((16*\GROUP+8)&31),%eax
+	orl %eax,%ebx
+	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
+	shll $((16*\GROUP)&31),%eax
+	orl %eax,%ebx
+	.endm
+
+	# invoke macro 4 times for a total of 32 butterflies
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=0
+	butterfly GROUP=1
+	movl %ebx,(%edx)	# stash first 32 decisions
+	xorl %ebx,%ebx
+	butterfly GROUP=2
+	butterfly GROUP=3
+	movl %ebx,4(%edx)	# stash second 32 decisions
+
+	addl $8,%edx		# bump decision pointer
+
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmpl $150,%eax		# is it greater than 150?
+	movl $0,%eax		# mov leaves the flags from cmpl intact
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movq (%edi),%mm0
+	pminub 8(%edi),%mm0
+	pminub 16(%edi),%mm0
+	pminub 24(%edi),%mm0
+	pminub 32(%edi),%mm0
+	pminub 40(%edi),%mm0
+	pminub 48(%edi),%mm0
+	pminub 56(%edi),%mm0
+	# mm0 contains 8 smallest metrics
+	# crunch down to single lowest metric
+	movq %mm0,%mm1
+	psrlq $32,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $16,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $8,%mm0
+	pminub %mm1,%mm0
+	punpcklbw %mm0,%mm0	# expand to all 8 bytes
+	pshufw $0,%mm0,%mm0
+
+	# mm0 now contains lowest metric in all 8 bytes
+	# subtract it from every output metric
+	# Trashes %mm7
+	.macro PSUBUSBM REG,MEM
+	movq \MEM,%mm7
+	psubusb \REG,%mm7
+	movq %mm7,\MEM
+	.endm
+
+	PSUBUSBM %mm0,(%edi)
+	PSUBUSBM %mm0,8(%edi)
+	PSUBUSBM %mm0,16(%edi)
+	PSUBUSBM %mm0,24(%edi)
+	PSUBUSBM %mm0,32(%edi)
+	PSUBUSBM %mm0,40(%edi)
+	PSUBUSBM %mm0,48(%edi)
+	PSUBUSBM %mm0,56(%edi)
+
+	movd %mm0,%eax		# lowest metric; value unused, %eax is overwritten at done
+	and $0xff,%eax
+
+done:	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+
+	ret
+
+	.data
+
+	.align 16
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31
+	
+	
+
diff --git a/libfec/ssebfly29.s b/libfec/ssebfly29.s
new file mode 100644
index 0000000..d7d2149
--- /dev/null
+++ b/libfec/ssebfly29.s
@@ -0,0 +1,271 @@
+/* Intel SIMD SSE implementation of Viterbi ACS butterflies
+   for 256-state (k=9) convolutional code
+   Copyright 2004 Phil Karn, KA9Q
+   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
+
+   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits); 
+*/
+	# SSE (64-bit integer SIMD) version; requires Pentium III or better
+	# i386 stack args: 8(%ebp) = vp, 12(%ebp) = syms, 16(%ebp) = nbits; %eax = 0, or -1 if vp is NULL
+	# These are offsets into struct v29, defined in viterbi29.h
+	.set DP,512
+	.set OLDMETRICS,516
+	.set NEWMETRICS,520
+	.text
+	.global update_viterbi29_blk_sse,Branchtab29_sse
+	.type update_viterbi29_blk_sse,@function
+	.align 16
+
+update_viterbi29_blk_sse:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %edi
+	pushl %edx
+	pushl %ebx
+
+	movl 8(%ebp),%edx	# edx = vp
+	testl %edx,%edx
+	jnz  0f
+	movl $-1,%eax		# NULL vp: return -1 (needs '$'; a bare -1 is a load from address -1)
+	jmp  err
+0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
+	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
+	movl DP(%edx),%edx	# edx -> decisions
+
+1:	movl 16(%ebp),%eax	# eax = nbits
+	decl %eax
+	jl   2f			# passed zero, we're done
+	movl %eax,16(%ebp)	# write the decremented count back to the stack slot
+
+	xorl %eax,%eax
+	movl 12(%ebp),%ebx	# ebx = syms
+	movb (%ebx),%al
+	movd %eax,%mm6		# mm6[0] = first symbol
+	movb 1(%ebx),%al
+	movd %eax,%mm5		# mm5[0] = second symbol
+	addl $2,%ebx
+	movl %ebx,12(%ebp)	# syms advanced by the two symbols just read
+
+	punpcklbw %mm6,%mm6	# mm6[1] = mm6[0]
+	punpcklbw %mm5,%mm5
+
+	movq thirtyones,%mm7	# mm7 = 31 in every byte; reloaded each bit (PSUBUSBM trashes it)
+	pshufw $0,%mm6,%mm6	# copy low word to upper 3
+	pshufw $0,%mm5,%mm5
+	# mm6 now contains first symbol in each byte, mm5 the second
+
+	# each invocation of this macro does 8 butterflies in parallel
+	.MACRO butterfly GROUP
+	# compute branch metrics
+	movq Branchtab29_sse+(8*\GROUP),%mm4
+	movq Branchtab29_sse+128+(8*\GROUP),%mm3
+	pxor %mm6,%mm4
+	pxor %mm5,%mm3
+	pavgb %mm3,%mm4			# mm4 contains branch metrics
+	psrlw $3,%mm4
+	pand %mm7,%mm4			# keep 5 bits per byte
+
+	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
+	movq ((8*\GROUP)+128)(%esi),%mm3	# Incoming path metric, high bit = 1
+	movq %mm0,%mm2
+	movq %mm3,%mm1
+	paddusb %mm4,%mm0
+	paddusb %mm4,%mm3
+
+	# invert branch metrics. This works only because they're 5 bits
+	pxor %mm7,%mm4
+
+	paddusb %mm4,%mm1
+	paddusb %mm4,%mm2
+
+	# Find survivors, leave in mm0,2
+	pminub %mm1,%mm0
+	pminub %mm3,%mm2
+	# get decisions, leave in mm1,3 (0xff where the high-bit=1 candidate equals the survivor)
+	pcmpeqb %mm0,%mm1
+	pcmpeqb %mm2,%mm3
+
+	# interleave the surviving metrics from mm0,2 and store them as new metrics
+	movq %mm0,%mm4
+	punpckhbw %mm2,%mm0	# interleave second 8 new metrics
+	punpcklbw %mm2,%mm4	# interleave first 8 new metrics
+	movq %mm0,(16*\GROUP+8)(%edi)
+	movq %mm4,(16*\GROUP)(%edi)
+
+	# interleave decisions, accumulate into %ebx
+	movq %mm1,%mm4
+	punpckhbw %mm3,%mm1
+	punpcklbw %mm3,%mm4
+	# Due to an error in the Intel instruction set ref (the register
+	# fields are swapped), gas assembles pmovmskb incorrectly
+	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
+	.byte 0x0f,0xd7,0xc1	# pmovmskb %mm1,%eax
+	shll $((16*\GROUP+8)&31),%eax
+	orl %eax,%ebx
+	.byte 0x0f,0xd7,0xc4	# pmovmskb %mm4,%eax
+	shll $((16*\GROUP)&31),%eax
+	orl %eax,%ebx
+	.endm
+
+	# invoke macro 16 times for a total of 128 butterflies
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=0
+	butterfly GROUP=1
+	movl %ebx,(%edx)	# stash decision word 0 (32 decisions)
+	xorl %ebx,%ebx
+	butterfly GROUP=2
+	butterfly GROUP=3
+	movl %ebx,4(%edx)	# stash decision word 1
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=4
+	butterfly GROUP=5
+	movl %ebx,8(%edx)	# stash decision word 2
+	xorl %ebx,%ebx
+	butterfly GROUP=6
+	butterfly GROUP=7
+	movl %ebx,12(%edx)	# stash decision word 3
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=8
+	butterfly GROUP=9
+	movl %ebx,16(%edx)	# stash decision word 4
+	xorl %ebx,%ebx
+	butterfly GROUP=10
+	butterfly GROUP=11
+	movl %ebx,20(%edx)	# stash decision word 5
+	xorl %ebx,%ebx		# clear decisions
+	butterfly GROUP=12
+	butterfly GROUP=13
+	movl %ebx,24(%edx)	# stash decision word 6
+	xorl %ebx,%ebx
+	butterfly GROUP=14
+	butterfly GROUP=15
+	movl %ebx,28(%edx)	# stash decision word 7
+
+	addl $32,%edx		# bump decision pointer
+
+	# see if we have to normalize
+	movl (%edi),%eax	# extract first output metric
+	andl $255,%eax
+	cmp $50,%eax		# is it greater than 50?
+	movl $0,%eax		# mov leaves the flags from cmp intact
+	jle done		# No, no need to normalize
+
+	# Normalize by finding smallest metric and subtracting it
+	# from all metrics
+	movq (%edi),%mm0
+	pminub 8(%edi),%mm0
+	pminub 16(%edi),%mm0
+	pminub 24(%edi),%mm0
+	pminub 32(%edi),%mm0
+	pminub 40(%edi),%mm0
+	pminub 48(%edi),%mm0
+	pminub 56(%edi),%mm0
+	pminub 64(%edi),%mm0
+	pminub 72(%edi),%mm0
+	pminub 80(%edi),%mm0
+	pminub 88(%edi),%mm0
+	pminub 96(%edi),%mm0
+	pminub 104(%edi),%mm0
+	pminub 112(%edi),%mm0
+	pminub 120(%edi),%mm0
+	pminub 128(%edi),%mm0
+	pminub 136(%edi),%mm0
+	pminub 144(%edi),%mm0
+	pminub 152(%edi),%mm0
+	pminub 160(%edi),%mm0
+	pminub 168(%edi),%mm0
+	pminub 176(%edi),%mm0
+	pminub 184(%edi),%mm0
+	pminub 192(%edi),%mm0
+	pminub 200(%edi),%mm0
+	pminub 208(%edi),%mm0
+	pminub 216(%edi),%mm0
+	pminub 224(%edi),%mm0
+	pminub 232(%edi),%mm0
+	pminub 240(%edi),%mm0
+	pminub 248(%edi),%mm0
+	# mm0 contains 8 smallest metrics
+	# crunch down to single lowest metric
+	movq %mm0,%mm1
+	psrlq $32,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $16,%mm0
+	pminub %mm1,%mm0
+	movq %mm0,%mm1
+	psrlq $8,%mm0
+	pminub %mm1,%mm0
+	movq 8(%edi),%mm1	# reload (mm1 is not read again before the next butterfly overwrites it)
+	punpcklbw %mm0,%mm0	# expand to all 8 bytes
+	pshufw $0,%mm0,%mm0
+
+	# mm0 now contains lowest metric in all 8 bytes
+	# subtract it from every output metric
+	# Trashes %mm7
+	.macro PSUBUSBM REG,MEM
+	movq \MEM,%mm7
+	psubusb \REG,%mm7
+	movq %mm7,\MEM
+	.endm
+
+	PSUBUSBM %mm0,(%edi)
+	PSUBUSBM %mm0,8(%edi)
+	PSUBUSBM %mm0,16(%edi)
+	PSUBUSBM %mm0,24(%edi)
+	PSUBUSBM %mm0,32(%edi)
+	PSUBUSBM %mm0,40(%edi)
+	PSUBUSBM %mm0,48(%edi)
+	PSUBUSBM %mm0,56(%edi)
+	PSUBUSBM %mm0,64(%edi)
+	PSUBUSBM %mm0,72(%edi)
+	PSUBUSBM %mm0,80(%edi)
+	PSUBUSBM %mm0,88(%edi)
+	PSUBUSBM %mm0,96(%edi)
+	PSUBUSBM %mm0,104(%edi)
+	PSUBUSBM %mm0,112(%edi)
+	PSUBUSBM %mm0,120(%edi)
+	PSUBUSBM %mm0,128(%edi)
+	PSUBUSBM %mm0,136(%edi)
+	PSUBUSBM %mm0,144(%edi)
+	PSUBUSBM %mm0,152(%edi)
+	PSUBUSBM %mm0,160(%edi)
+	PSUBUSBM %mm0,168(%edi)
+	PSUBUSBM %mm0,176(%edi)
+	PSUBUSBM %mm0,184(%edi)
+	PSUBUSBM %mm0,192(%edi)
+	PSUBUSBM %mm0,200(%edi)
+	PSUBUSBM %mm0,208(%edi)
+	PSUBUSBM %mm0,216(%edi)
+	PSUBUSBM %mm0,224(%edi)
+	PSUBUSBM %mm0,232(%edi)
+	PSUBUSBM %mm0,240(%edi)
+	PSUBUSBM %mm0,248(%edi)
+
+done:
+	# swap metrics
+	movl %esi,%eax
+	movl %edi,%esi
+	movl %eax,%edi
+	jmp 1b
+
+2:	emms
+	movl 8(%ebp),%ebx	# ebx = vp
+	# stash metric pointers
+	movl %esi,OLDMETRICS(%ebx)
+	movl %edi,NEWMETRICS(%ebx)
+	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
+	xorl %eax,%eax		# success: return 0
+err:	popl %ebx
+	popl %edx
+	popl %edi
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 8
+thirtyones:
+	.byte 31,31,31,31,31,31,31,31
+	
+
diff --git a/libfec/sumsq.c b/libfec/sumsq.c
new file mode 100644
index 0000000..e567c89
--- /dev/null
+++ b/libfec/sumsq.c
@@ -0,0 +1,50 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include <stdlib.h>
+#include "fec.h"
+
+unsigned long long sumsq_port(signed short *,int);
+
+#ifdef __i386__
+unsigned long long sumsq_mmx(signed short *,int);
+unsigned long long sumsq_sse(signed short *,int);
+unsigned long long sumsq_sse2(signed short *,int);
+#endif
+
+#ifdef __x86_64__
+unsigned long long sumsq_sse2(signed short *,int);
+#endif
+
+#ifdef __VEC__
+unsigned long long sumsq_av(signed short *,int);
+#endif
+
+unsigned long long sumsq(signed short *in,int cnt){
+  /* Dispatch on the global Cpu_mode. Only the branches compiled in for
+   * this architecture exist; every other mode falls through to the
+   * portable routine at the bottom.
+   */
+
+#ifdef __i386__
+  if(Cpu_mode == SSE || Cpu_mode == MMX)
+    return sumsq_mmx(in,cnt);	/* SSE shares the MMX routine */
+  if(Cpu_mode == SSE2)
+    return sumsq_sse2(in,cnt);
+#endif
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return sumsq_av(in,cnt);
+#endif
+
+  /* Reached for PORT, for any mode with no branch above, and for SSE2
+   * on x86-64, where the portable routine is used rather than
+   * sumsq_sse2() even though that routine is declared for this
+   * architecture.
+   */
+  return sumsq_port(in,cnt);
+}
diff --git a/libfec/sumsq_av.c b/libfec/sumsq_av.c
new file mode 100644
index 0000000..53c6acf
--- /dev/null
+++ b/libfec/sumsq_av.c
@@ -0,0 +1,78 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * This is the Altivec SIMD version. It's a little hairy because Altivec
+ * does not do 64-bit operations directly, so we have to accumulate separate
+ * 32-bit sums and carries
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+#include "fec.h"
+
+unsigned long long sumsq_av(signed short *in,int cnt){ /* sum of in[i]*in[i] over cnt samples */
+  long long sum;
+  vector signed short x;
+  vector unsigned int sums,carries,s1,s2; /* low 32-bit partial sums and their carry-outs */
+  int pad; /* byte offset of in within its 16-byte block */
+  union { vector unsigned char cv; vector unsigned int iv; unsigned int w[4]; unsigned char c[16];} s;
+
+  carries = sums = (vector unsigned int)(0);
+  if((pad = (int)in & 15)!=0){
+    /* Load unaligned leading word; lanes past the 16-byte block are filled from the zero vector */
+    x = vec_perm(vec_ld(0,in),(vector signed short)(0),vec_lvsl(0,in));
+    if(cnt < 8){ /* Shift right to chop stuff beyond end of short block */
+      s.c[15] = (8-cnt)<<4; /* (8-cnt)*16: presumably a bit count, i.e. 8-cnt shorts; compare the 64/32 shifts below */
+      x = vec_sro(x,s.cv);
+    }
+    sums = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    in += 8-pad/2; /* 8-pad/2 shorts were consumed by the leading load */
+    cnt -= 8-pad/2; /* may go negative for a short block; both tests below then skip */
+  }
+  /* Everything is now aligned, rip through most of the block */
+  while(cnt >= 8){
+    x = vec_ld(0,in);
+    /* A single vec_msum cannot overflow, but we have to sum it with
+     * the earlier terms separately to handle the carries
+     * The cast to unsigned is OK because squares are always positive
+     */
+    s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    carries = vec_add(carries,vec_addc(sums,s1)); /* record carry-outs before sums is updated */
+    sums = vec_add(sums,s1);
+    in += 8;
+    cnt -= 8;
+  }
+  /* Handle trailing fragment, if any (1..7 samples) */
+  if(cnt > 0){
+    x = vec_ld(0,in);
+    s.c[15] = (8-cnt)<<4; /* same encoding as above: drop the 8-cnt lanes past the end */
+    x = vec_sro(x,s.cv);
+    s1 = (vector unsigned int)vec_msum(x,x,(vector signed int)(0));
+    carries = vec_add(carries,vec_addc(sums,s1));
+    sums = vec_add(sums,s1);
+  }
+  /* Combine 4 sub-sums and carries: fold words 0,1 onto words 2,3, then word 2 onto word 3 */
+  s.c[15] = 64; /* Shift right two 32-bit words */
+  s1 = vec_sro(sums,s.cv);
+  s2 = vec_sro(carries,s.cv);
+  carries = vec_add(carries,vec_addc(sums,s1));
+  sums = vec_add(sums,s1);
+  carries = vec_add(carries,s2);
+
+  s.c[15] = 32; /* Shift right one 32-bit word */
+  s1 = vec_sro(sums,s.cv);
+  s2 = vec_sro(carries,s.cv);
+  carries = vec_add(carries,vec_addc(sums,s1));
+  sums = vec_add(sums,s1);
+  carries = vec_add(carries,s2);
+
+  /* Extract sum and carries from right-hand words and combine into result */
+  s.iv = sums;
+  sum = s.w[3]; /* low 32 bits of the total */
+
+  s.iv = carries;
+  sum += (long long)s.w[3] << 32; /* accumulated carries form the high 32 bits */
+
+  return sum;
+}
+
diff --git a/libfec/sumsq_mmx.c b/libfec/sumsq_mmx.c
new file mode 100644
index 0000000..e766831
--- /dev/null
+++ b/libfec/sumsq_mmx.c
@@ -0,0 +1,35 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ *  MMX-assisted version (also used on SSE)
+
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser Public License (LGPL)
+ */
+
+long long sumsq_mmx_assist(signed short *,int);
+
+long long sumsq_mmx(signed short *in,int cnt){
+  long long acc = 0;
+  int tail;
+
+  /* Scalar lead-in: square one sample at a time until in sits on an
+   * 8-byte boundary (or the samples run out)
+   */
+  for(;cnt != 0 && ((int)in & 7) != 0;in++,cnt--)
+    acc += (long)*in * *in;
+
+  /* The MMX helper sums the whole groups of 8 samples; step past them */
+  acc += sumsq_mmx_assist(in,cnt);
+  in += cnt & ~7;
+  tail = cnt & 7;
+
+  /* Scalar tail: the 0..7 samples left over */
+  for(;tail != 0;tail--,in++)
+    acc += (long)*in * *in;
+
+  return acc;
+}
diff --git a/libfec/sumsq_mmx_assist.s b/libfec/sumsq_mmx_assist.s
new file mode 100644
index 0000000..b3bac66
--- /dev/null
+++ b/libfec/sumsq_mmx_assist.s
@@ -0,0 +1,83 @@
+# MMX assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+
+# Evaluate sum of squares of signed 16-bit input samples
+#  long long sumsq_mmx_assist(signed short *in,int cnt);
+# Sums whole groups of 8 samples only; a remainder of cnt%8 samples is left to the caller
+	.global sumsq_mmx_assist
+	.type sumsq_mmx_assist,@function
+	.align 16
+sumsq_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+	pushl %ebx
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+	xor %eax,%eax		# edx:eax = 64-bit running sum
+	xor %edx,%edx
+
+	# A pmaddwd dword can reach 2**31 (two -32768 samples), so adding two of them
+	# with paddd could wrap to 0; fold each dword into %edx:%eax on its own instead
+1:	subl $8,%ecx
+	jl 2f
+	movq (%esi),%mm0	# S0 S1 S2 S3
+	pmaddwd %mm0,%mm0	# (S0^2+S1^2) (S2^2+S3^2)
+	movq 8(%esi),%mm6	# S4 S5 S6 S7
+	pmaddwd %mm6,%mm6	# (S4^2+S5^2) (S6^2+S7^2)
+	movd %mm0,%ebx; addl %ebx,%eax; adcl $0,%edx	# sum += S0^2+S1^2
+	psrlq $32,%mm0
+	movd %mm0,%ebx; addl %ebx,%eax; adcl $0,%edx	# sum += S2^2+S3^2
+	movd %mm6,%ebx; addl %ebx,%eax; adcl $0,%edx	# sum += S4^2+S5^2
+	psrlq $32,%mm6
+	movd %mm6,%ebx; addl %ebx,%eax; adcl $0,%edx	# sum += S6^2+S7^2
+	addl $16,%esi
+	jmp 1b
+
+2:	emms
+	popl %ebx
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+	
+# Evaluate sum of squares of signed 16-bit input samples
+#  long sumsq_wd_mmx_assist(signed short *in,int cnt);
+#  Quick version, only safe for small numbers of small input values (sums are kept in 32 bits)
+	.global sumsq_wd_mmx_assist
+	.type sumsq_wd_mmx_assist,@function
+	.align 16
+sumsq_wd_mmx_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt (not saved/restored here)
+	pxor %mm2,%mm2		# zero sum
+
+1:	subl $8,%ecx		# 8 samples per pass; leftovers below 8 are not summed
+	jl 2f
+	movq (%esi),%mm0	# S0 S1 S2 S3
+	pmaddwd %mm0,%mm0	# (S0*S0+S1*S1) (S2*S2+S3*S3)
+	movq 8(%esi),%mm1	# S4 S5 S6 S7
+	pmaddwd %mm1,%mm1	# (S4*S4+S5*S5) (S6*S6+S7*S7)
+	paddd %mm1,%mm2		# 32-bit lane adds
+	paddd %mm0,%mm2		# accumulate
+
+	addl $16,%esi
+	jmp 1b	
+
+2:	movd %mm2,%eax		# even sum	
+	psrlq $32,%mm2
+	movd %mm2,%edx		# odd sum
+	addl %edx,%eax		# 32-bit result in %eax
+	emms
+	popl %esi
+	popl %ebp
+	ret
diff --git a/libfec/sumsq_port.c b/libfec/sumsq_port.c
new file mode 100644
index 0000000..6d0b4c1
--- /dev/null
+++ b/libfec/sumsq_port.c
@@ -0,0 +1,16 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ *  Portable C version
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+
+unsigned long long sumsq_port(signed short *in,int cnt){
+  long long total = 0;	/* 64-bit accumulator */
+  /* Each square is formed in int, then widened into the accumulator */
+  for(;cnt > 0;cnt--){
+    int s = *in++;
+    total += s * s;
+  }
+  return total;
+}
diff --git a/libfec/sumsq_sse2.c b/libfec/sumsq_sse2.c
new file mode 100644
index 0000000..b05d2e9
--- /dev/null
+++ b/libfec/sumsq_sse2.c
@@ -0,0 +1,33 @@
+/* Compute the sum of the squares of a vector of signed shorts
+
+ * The SSE2 and MMX assist routines both operate on multiples of
+ * 8 words; they differ only in their alignment requirements (8 bytes
+ * for MMX, 16 bytes for SSE2)
+
+ * Copyright 2004 Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser Public License (LGPL)
+ */
+
+long long sumsq_sse2_assist(signed short *,int);
+
+long long sumsq_sse2(signed short *in,int cnt){
+  long long acc = 0;
+  int tail;
+
+  /* Scalar lead-in: square one sample at a time until in sits on a
+   * 16-byte boundary (or the samples run out)
+   */
+  for(;cnt != 0 && ((int)in & 15) != 0;in++,cnt--)
+    acc += (long)*in * *in;
+
+  /* The SSE2 helper sums the whole groups of 8 samples; step past them */
+  acc += sumsq_sse2_assist(in,cnt);
+  in += cnt & ~7;
+  tail = cnt & 7;
+
+  /* Scalar tail: the 0..7 samples left over */
+  for(;tail != 0;tail--,in++)
+    acc += (long)*in * *in;
+
+  return acc;
+}
diff --git a/libfec/sumsq_sse2_assist.s b/libfec/sumsq_sse2_assist.s
new file mode 100644
index 0000000..d1c4ee7
--- /dev/null
+++ b/libfec/sumsq_sse2_assist.s
@@ -0,0 +1,49 @@
+# SSE2 assist routines for sumsq
+# Copyright 2001 Phil Karn, KA9Q
+# May be used under the terms of the GNU Public License (GPL)
+
+	.text
+# Evaluate sum of squares of signed 16-bit input samples (whole groups of 8 only)
+#  long long sumsq_sse2_assist(signed short *in,int cnt);	
+	.global sumsq_sse2_assist
+	.type sumsq_sse2_assist,@function
+	.align 16
+sumsq_sse2_assist:
+	pushl %ebp
+	movl %esp,%ebp
+	pushl %esi
+	pushl %ecx
+
+	movl 8(%ebp),%esi	# esi = in
+	movl 12(%ebp),%ecx	# ecx = cnt
+	pxor %xmm2,%xmm2		# zero sum
+	movaps low,%xmm3		# load mask
+
+1:	subl $8,%ecx		# 8 samples per pass; leftovers below 8 are not summed
+	jl 2f
+	movaps (%esi),%xmm0	# S0 S1 S2 S3 S4 S5 S6 S7
+	pmaddwd %xmm0,%xmm0	# (S0*S0+S1*S1) (S2*S2+S3*S3) (S4*S4+S5*S5) (S6*S6+S7*S7)
+	movaps %xmm0,%xmm1
+	pand %xmm3,%xmm1	# (S0*S0+S1*S1) 0 (S4*S4+S5*S5) 0
+	paddq %xmm1,%xmm2	# sum even-numbered dwords
+	psrlq $32,%xmm0		# (S2*S2+S3*S3) 0 (S6*S6+S7*S7) 0
+	paddq %xmm0,%xmm2	# sum odd-numbered dwords
+	addl $16,%esi
+	jmp 1b	
+
+2:	movaps %xmm2,%xmm0
+	psrldq $8,%xmm0		# bring the upper 64-bit sum down (byte shift)
+	paddq %xmm2,%xmm0	# combine 64-bit sums
+
+	movd %xmm0,%eax		# low 32 bits of sum
+	psrldq $4,%xmm0
+	movd %xmm0,%edx		# high 32 bits of sum
+	
+	popl %ecx
+	popl %esi
+	popl %ebp
+	ret
+
+	.data
+	.align 16
+low:	.byte 255,255,255,255,0,0,0,0,255,255,255,255,0,0,0,0	# keeps dwords 0 and 2
diff --git a/libfec/sumsq_test.c b/libfec/sumsq_test.c
new file mode 100644
index 0000000..4debd47
--- /dev/null
+++ b/libfec/sumsq_test.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include "config.h"
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = {	/* long spellings of the short flags handled in main() */
+  {"frame-length",1,NULL,'l'},	/* -l N: sets bufsize */
+  {"frame-count",1,NULL,'n'},	/* -n N: sets trials */
+  {"verbose",0,NULL,'v'},	/* -v: increments Verbose */
+  {"force-altivec",0,NULL,'a'},	/* -a: Cpu_mode = ALTIVEC */
+  {"force-port",0,NULL,'p'},	/* -p: Cpu_mode = PORT */
+  {"force-mmx",0,NULL,'m'},	/* -m: Cpu_mode = MMX */
+  {"force-sse",0,NULL,'s'},	/* -s: Cpu_mode = SSE */
+  {"force-sse2",0,NULL,'t'},	/* -t: Cpu_mode = SSE2 */
+  {NULL},	/* end of table; -T (timing run) has no long spelling */
+};
+#endif
+
+int Verbose = 0;
+
+int main(int argc,char *argv[]){
+  signed short *buf;
+  int i,d,trial,trials=10000;
+  int bufsize = 2048;
+  long long port_sum,simd_sum;
+  time_t t;
+  int timetrials=0;
+  /* Compare sumsq() with sumsq_port() on random buffers, or only time sumsq() when -T is given */
+  find_cpu_mode();
+  time(&t);
+  srandom(t);
+
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"vapmstl:n:T",Options,NULL)) != EOF){
+#else
+  while((d = getopt(argc,argv,"vapmstl:n:T")) != EOF){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      bufsize = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    case 'T':
+      timetrials++;
+      break;
+    }
+  }
+
+  if(bufsize <= 0 || (buf = (signed short *)calloc(bufsize,sizeof(signed short))) == NULL){
+    fprintf(stderr,"sumsq_test: frame length %d is not positive or could not be allocated\n",bufsize);
+    exit(1);	/* avoids random() % 0 and a NULL buffer below */
+  }
+  if(timetrials){
+    for(trial=0;trial<trials;trial++){
+      (void)sumsq(buf,bufsize);
+    }
+  } else {
+    for(trial=0;trial<trials;trial++){
+      int length,offset;
+      offset = random() & 7;	/* start 0..7 samples into the buffer */
+      length = (random() % bufsize) - offset;	/* keeps offset+length below bufsize */
+      if(length <= 0)
+	continue;
+      for(i=0;i<bufsize;i++)
+	buf[i] = random();
+      port_sum = sumsq_port(buf+offset,length);
+      simd_sum = sumsq(buf+offset,length);
+      if(port_sum != simd_sum){
+	printf("offset %d len %d port_sum = %lld simd_sum = %lld ",offset,length,port_sum,simd_sum);
+	printf("ERROR! diff = %lld\n",simd_sum-port_sum);
+      }
+    }
+  }
+  exit(0);
+}
diff --git a/libfec/viterbi27.c b/libfec/viterbi27.c
new file mode 100644
index 0000000..316fee4
--- /dev/null
+++ b/libfec/viterbi27.c
@@ -0,0 +1,188 @@
+/* K=7 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new K=7 r=1/2 Viterbi decoder instance; len is handed unchanged to the backend chosen by Cpu_mode */
+void *create_viterbi27(int len){
+  find_cpu_mode(); /* defined elsewhere; presumably establishes Cpu_mode before the dispatch below */
+
+  switch(Cpu_mode){
+  case PORT:
+  default: /* any mode not compiled in for this target falls back to portable C */
+    return create_viterbi27_port(len);
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi27_av(len);
+#endif
+#ifdef __i386__
+  case MMX:
+    return create_viterbi27_mmx(len);
+  case SSE:
+    return create_viterbi27_sse(len);
+  case SSE2:
+    return create_viterbi27_sse2(len);
+#endif
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+    return create_viterbi27_port(len);
+#endif
+  }
+}
+
+void set_viterbi27_polynomial(int polys[2]){ /* Forward the two generator polynomials to the backend selected by Cpu_mode */
+  switch(Cpu_mode){ /* NOTE(review): find_cpu_mode() is not called here — confirm Cpu_mode is valid if this runs before create_viterbi27() */
+  case PORT:
+  default:
+    set_viterbi27_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi27_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi27_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi27_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi27_polynomial_sse2(polys);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+    set_viterbi27_polynomial_port(polys);
+    break;
+#endif
+  }
+}
+
+/* Initialize Viterbi decoder for start of new frame; returns whatever the backend selected by Cpu_mode returns */
+int init_viterbi27(void *p,int starting_state){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      return init_viterbi27_port(p,starting_state);
+#ifdef __VEC__
+    case ALTIVEC:
+      return init_viterbi27_av(p,starting_state);
+#endif
+#ifdef __i386__
+    case MMX:
+      return init_viterbi27_mmx(p,starting_state);
+    case SSE:
+      return init_viterbi27_sse(p,starting_state);
+    case SSE2:
+      return init_viterbi27_sse2(p,starting_state);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      return init_viterbi27_port(p,starting_state);
+#endif
+    }
+}
+
+/* Viterbi chainback: forwards all arguments to the backend selected by Cpu_mode and returns its result */
+int chainback_viterbi27(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      return chainback_viterbi27_port(p,data,nbits,endstate);
+#ifdef __VEC__
+    case ALTIVEC:
+      return chainback_viterbi27_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+    case MMX:
+      return chainback_viterbi27_mmx(p,data,nbits,endstate);
+    case SSE:
+      return chainback_viterbi27_sse(p,data,nbits,endstate);
+    case SSE2:
+      return chainback_viterbi27_sse2(p,data,nbits,endstate);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      return chainback_viterbi27_port(p,data,nbits,endstate);
+#endif
+    }
+}
+
+/* Delete instance of a Viterbi decoder by handing p to the delete routine of the backend selected by Cpu_mode */
+void delete_viterbi27(void *p){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      delete_viterbi27_port(p);
+      break;
+#ifdef __VEC__
+    case ALTIVEC:
+      delete_viterbi27_av(p);
+      break;
+#endif
+#ifdef __i386__
+    case MMX:
+      delete_viterbi27_mmx(p);
+      break;
+    case SSE:
+      delete_viterbi27_sse(p);
+      break;
+    case SSE2:
+      delete_viterbi27_sse2(p);
+      break;
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      delete_viterbi27_port(p);
+      break;
+#endif
+    }
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Returns -1 if p is NULL, otherwise 0 (backend return values are not propagated).
+ */
+int update_viterbi27_blk(void *p,unsigned char syms[],int nbits){
+  if(p == NULL)
+    return -1;
+
+  switch(Cpu_mode){
+  case PORT:
+  default: /* any mode not compiled in for this target falls back to portable C */
+    update_viterbi27_blk_port(p,syms,nbits);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    update_viterbi27_blk_av(p,syms,nbits);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    update_viterbi27_blk_mmx(p,syms,nbits);
+    break;
+  case SSE:
+    update_viterbi27_blk_sse(p,syms,nbits);
+    break;
+  case SSE2:
+    update_viterbi27_blk_sse2(p,syms,nbits);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+    update_viterbi27_blk_port(p,syms,nbits);
+    break;
+#endif
+  }
+  return 0;
+}
diff --git a/libfec/viterbi27_av.c b/libfec/viterbi27_av.c
new file mode 100644
index 0000000..98d7344
--- /dev/null
+++ b/libfec/viterbi27_av.c
@@ -0,0 +1,210 @@
+/* K=7 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec instructions
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "fec.h"
+
+typedef union { long long p; unsigned char c[64]; vector bool char v[4]; } decision_t; /* one decision byte per state (c), also viewable as 4 vectors (v) */
+typedef union { long long p; unsigned char c[64]; vector unsigned char v[4]; } metric_t; /* one path metric byte per state (c), also viewable as 4 vectors (v) */
+
+static union branchtab27 { unsigned char c[32]; vector unsigned char v[2];} Branchtab27[2]; /* filled by set_viterbi27_polynomial_av(), one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi27_polynomial_av() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! (NOTE(review): looks like a stale copy — the x86 K=7 sources name *bfly27.s; confirm whether any assembler uses this AltiVec layout)
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder for a new frame: every path metric becomes 63 except starting_state (mod 64), which becomes 0. Returns -1 if p is NULL, else 0 */
+int init_viterbi27_av(void *p,int starting_state){
+  struct v27 *dec = p;
+  int lane = 0;
+
+  if(dec == NULL)
+    return -1;
+  dec->dp = dec->decisions;
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  while(lane < 4)
+    dec->metrics1.v[lane++] = (vector unsigned char)(63);
+  dec->metrics1.c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi27_polynomial_av(int polys[2]){ /* Build both 32-entry branch tables from the generator polynomials */
+  int tap,s;
+
+  for(tap=0;tap < 2;tap++){ /* a negative polynomial inverts the entries of its table */
+    for(s=0;s < 32;s++)
+      Branchtab27[tap].c[s] = (polys[tap] < 0) ^ parity((2*s) & abs(polys[tap])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Allocate and initialise a decoder with room for len+6 decision records; returns NULL if either allocation fails */
+void *create_viterbi27_av(int len){
+  struct v27 *dec;
+
+  if(Init == 0){ /* first use: install the default polynomials */
+    int defaults[2] = { V27POLYA,V27POLYB };
+    set_viterbi27_polynomial_av(defaults);
+  }
+  dec = (struct v27 *)malloc(sizeof(struct v27));
+  if(dec != NULL && (dec->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(dec);
+    dec = NULL;
+  }
+  if(dec != NULL)
+    init_viterbi27_av(dec,0);
+  return dec;
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate and write nbits decoded bits, 8 per byte, into data. Returns -1 if p is NULL, else 0 */
+int chainback_viterbi27_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions; /* only dereference vp once it is known to be non-NULL */
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = d[nbits].c[endstate>>2] & 1; /* decision for the current 6-bit state */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_av(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Process received symbols: for each of nbits bits, read two symbols from syms and store one decision_t at the current decision pointer. Returns -1 if p is NULL, else 0 */
+int update_viterbi27_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    vector unsigned char survivor0,survivor1,sym0v,sym1v;
+    vector bool char decision0,decision1;
+    vector unsigned char metric,m_metric,m0,m1,m2,m3;
+    void *tmp;
+
+    /* sym0v.0 = syms[0]; sym0v.1 = syms[1] */
+    sym0v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms));
+
+    sym1v = vec_splat(sym0v,1); /* Splat syms[1] across sym1v */
+    sym0v = vec_splat(sym0v,0); /* Splat syms[0] across sym0v */
+    syms += 2;
+
+    /* Do the 32 butterflies as two interleaved groups of 16 each to keep the pipes full */
+
+    /* Form first set of 16 branch metrics */
+    metric = vec_avg(vec_xor(Branchtab27[0].v[0],sym0v),vec_xor(Branchtab27[1].v[0],sym1v));
+    metric = vec_sr(metric,(vector unsigned char)(3));
+    m_metric = vec_sub((vector unsigned char)(31),metric);
+    
+    /* Form first set of path metrics */
+    m0 = vec_adds(vp->old_metrics->v[0],metric);
+    m3 = vec_adds(vp->old_metrics->v[2],metric);
+    m1 = vec_adds(vp->old_metrics->v[2],m_metric);
+    m2 = vec_adds(vp->old_metrics->v[0],m_metric);
+    
+    /* Form second set of 16 branch metrics */
+    metric = vec_avg(vec_xor(Branchtab27[0].v[1],sym0v),vec_xor(Branchtab27[1].v[1],sym1v));
+    metric = vec_sr(metric,(vector unsigned char)(3));
+    m_metric = vec_sub((vector unsigned char)(31),metric);
+
+    /* Compare and select first set */
+    decision0 = vec_cmpgt(m0,m1);
+    decision1 = vec_cmpgt(m2,m3);
+    survivor0 = vec_min(m0,m1);
+    survivor1 = vec_min(m2,m3);
+    
+    /* Compute second set of path metrics */
+    m0 = vec_adds(vp->old_metrics->v[1],metric);
+    m3 = vec_adds(vp->old_metrics->v[3],metric);
+    m1 = vec_adds(vp->old_metrics->v[3],m_metric);
+    m2 = vec_adds(vp->old_metrics->v[1],m_metric);
+
+    /* Interleave and store first decisions and survivors */
+    d->v[0] = vec_mergeh(decision0,decision1);
+    d->v[1] = vec_mergel(decision0,decision1);
+    vp->new_metrics->v[0] = vec_mergeh(survivor0,survivor1);
+    vp->new_metrics->v[1] = vec_mergel(survivor0,survivor1);
+    
+    /* Compare and select second set */
+    decision0 = vec_cmpgt(m0,m1);
+    decision1 = vec_cmpgt(m2,m3);
+    survivor0 = vec_min(m0,m1);
+    survivor1 = vec_min(m2,m3);
+
+    /* Interleave and store second set of decisions and survivors */
+    d->v[2] = vec_mergeh(decision0,decision1);
+    d->v[3] = vec_mergel(decision0,decision1);
+    vp->new_metrics->v[2] = vec_mergeh(survivor0,survivor1);
+    vp->new_metrics->v[3] = vec_mergel(survivor0,survivor1);
+   
+    /* renormalize if necessary: only the metric of state 0 is tested against the threshold */
+    if(vp->new_metrics->c[0] >= 105){
+      vector unsigned char scale0,scale1;
+
+      /* Find smallest metric and splat */
+      scale0 = vec_min(vp->new_metrics->v[0],vp->new_metrics->v[1]);
+      scale1 = vec_min(vp->new_metrics->v[2],vp->new_metrics->v[3]);
+      scale0 = vec_min(scale0,scale1);
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,8));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+
+      /* Now subtract from all metrics */
+      vp->new_metrics->v[0] = vec_subs(vp->new_metrics->v[0],scale0);
+      vp->new_metrics->v[1] = vec_subs(vp->new_metrics->v[1],scale0);
+      vp->new_metrics->v[2] = vec_subs(vp->new_metrics->v[2],scale0);
+      vp->new_metrics->v[3] = vec_subs(vp->new_metrics->v[3],scale0);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next call continues writing decisions */
+
+  return 0;
+}
+
diff --git a/libfec/viterbi27_mmx.c b/libfec/viterbi27_mmx.c
new file mode 100644
index 0000000..a6d5125
--- /dev/null
+++ b/libfec/viterbi27_mmx.c
@@ -0,0 +1,115 @@
+/* K=7 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+
+typedef union { char c[64]; __m64 v[8];} decision_t; /* one decision byte per state (c), also viewable as 8 MMX words (v) */
+typedef union { unsigned char c[64]; __m64 v[8];} metric_t; /* one path metric byte per state (c), also viewable as 8 MMX words (v) */
+
+unsigned char Mettab27_1[256][32] __attribute__ ((aligned(16))); /* filled by set_viterbi27_polynomial_mmx(): indexed [symbol][state], first polynomial */
+unsigned char Mettab27_2[256][32] __attribute__ ((aligned(16))); /* same layout, second polynomial */
+static int Init = 0; /* nonzero once set_viterbi27_polynomial_mmx() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder for a new frame: all 64 path metrics start at 63 except starting_state (mod 64), which starts at 0. Returns -1 if p is NULL, else 0 */
+int init_viterbi27_mmx(void *p,int starting_state){
+  struct v27 *dec = (struct v27 *)p;
+  int s = 64;
+
+  if(dec == NULL)
+    return -1;
+  while(s-- > 0)
+    dec->metrics1.c[s] = 63;
+
+  dec->dp = dec->decisions;
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Fill Mettab27_1/Mettab27_2 with a metric in 0..15 for every received symbol value (0..255) and each of the 32 states */
+void set_viterbi27_polynomial_mmx(int polys[2]){
+  int state;
+
+  for(state=0;state < 32;state++){
+    /* The expected bits depend only on the state, so compute them once per state rather than per symbol */
+    int sym0 = parity((2*state) & abs(polys[0])) ^ (polys[0] < 0);
+    int sym1 = parity((2*state) & abs(polys[1])) ^ (polys[1] < 0);
+    int symbol;
+
+    for(symbol = 0;symbol < 256;symbol++){
+      Mettab27_1[symbol][state] = (sym0 ? (255-symbol):symbol) / 16;
+      Mettab27_2[symbol][state] = (sym1 ? (255-symbol):symbol) / 16;
+    }
+  }
+  Init++;
+}
+
+
+/* Allocate and initialise a decoder with room for len+6 decision records; returns NULL if either allocation fails */
+void *create_viterbi27_mmx(int len){
+  int defaults[2] = { V27POLYA, V27POLYB };
+  struct v27 *dec;
+
+  if(!Init) /* first use: install the default polynomials */
+    set_viterbi27_polynomial_mmx(defaults);
+
+  dec = (struct v27 *)malloc(sizeof(struct v27));
+  if(dec != NULL && (dec->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(dec);
+    dec = NULL;
+  }
+
+  if(dec != NULL)
+    init_viterbi27_mmx(dec,0);
+  return dec;
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate and write nbits decoded bits, 8 per byte, into data. Returns -1 if p is NULL, else 0 */
+int chainback_viterbi27_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+  struct v27 *vp = (struct v27 *)p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate = (endstate & 63) << 2; /* keep the 6-bit state in bits 7..2, as the loop's endstate>>2 indexing expects */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = d[nbits].c[endstate>>2] & 1; /* decision for the current 6-bit state */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_mmx(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
diff --git a/libfec/viterbi27_port.c b/libfec/viterbi27_port.c
new file mode 100644
index 0000000..7cac2b3
--- /dev/null
+++ b/libfec/viterbi27_port.c
@@ -0,0 +1,191 @@
+/* K=7 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+
+typedef union { unsigned int w[64]; } metric_t; /* one path metric per state */
+typedef union { unsigned long w[2];} decision_t; /* decision bits; BFLY and chainback address them as 32 bits per word */
+static union branchtab27 { unsigned char c[32]; } Branchtab27[2] __attribute__ ((aligned(16))); /* filled by set_viterbi27_polynomial_port(), one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi27_polynomial_port() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s! (NOTE(review): looks like a stale copy — the x86 K=7 sources name *bfly27.s; confirm whether any assembler uses this portable layout)
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder for a new frame: all 64 path metrics start at 63 except starting_state (mod 64), which starts at 0. Returns -1 if p is NULL, else 0 */
+int init_viterbi27_port(void *p,int starting_state){
+  struct v27 *dec = p;
+  int s = 64;
+
+  if(dec == NULL)
+    return -1;
+  while(s-- > 0)
+    dec->metrics1.w[s] = 63;
+
+  dec->dp = dec->decisions;
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.w[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi27_polynomial_port(int polys[2]){ /* Build both 32-entry branch tables from the generator polynomials */
+  int tap,s;
+
+  for(tap=0;tap < 2;tap++){ /* a negative polynomial inverts the entries of its table */
+    for(s=0;s < 32;s++)
+      Branchtab27[tap].c[s] = (polys[tap] < 0) ^ parity((2*s) & abs(polys[tap])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Allocate and initialise a decoder with room for len+6 decision records; returns NULL if either allocation fails */
+void *create_viterbi27_port(int len){
+  struct v27 *dec;
+
+  if(Init == 0){ /* first use: install the default polynomials */
+    int defaults[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_port(defaults);
+  }
+  dec = malloc(sizeof(struct v27));
+  if(dec != NULL && (dec->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(dec);
+    dec = NULL;
+  }
+  if(dec != NULL)
+    init_viterbi27_port(dec,0);
+
+  return dec;
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate and write nbits decoded bits, 8 per byte, into data. Returns -1 if p is NULL, else 0 */
+int chainback_viterbi27_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].w[(endstate>>2)/32] >> ((endstate>>2)%32)) & 1; /* decision bit of the current 6-bit state: word = state/32, bit = state%32 */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_port(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: add-compare-select on old states i and i+32, writing new states 2*i and 2*i+1 and their decision bits. Expects vp, d, sym0 and sym1 in the expanding scope */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab27[0].c[i] ^ sym0) + (Branchtab27[1].c[i] ^ sym1);\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+32] + (510 - metric);\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i)&31);\
+    m0 -= (metric+metric-510);\
+    m1 += (metric+metric-510);\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Two symbols are read from syms per bit. Returns -1 if p is NULL, else 0.
+ */
+int update_viterbi27_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  void *tmp;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    unsigned char sym0,sym1;
+
+    d->w[0] = d->w[1] = 0; /* BFLY ORs decision bits in, so start from zero */
+    sym0 = *syms++;
+    sym1 = *syms++;
+    
+    BFLY(0);
+    BFLY(1);
+    BFLY(2);
+    BFLY(3);
+    BFLY(4);
+    BFLY(5);
+    BFLY(6);
+    BFLY(7);
+    BFLY(8);
+    BFLY(9);
+    BFLY(10);
+    BFLY(11);
+    BFLY(12);
+    BFLY(13);
+    BFLY(14);
+    BFLY(15);
+    BFLY(16);
+    BFLY(17);
+    BFLY(18);
+    BFLY(19);
+    BFLY(20);
+    BFLY(21);
+    BFLY(22);
+    BFLY(23);
+    BFLY(24);
+    BFLY(25);
+    BFLY(26);
+    BFLY(27);
+    BFLY(28);
+    BFLY(29);
+    BFLY(30);
+    BFLY(31);
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }    
+  vp->dp = d; /* remember where the next call continues writing decisions */
+  return 0;
+}
diff --git a/libfec/viterbi27_sse.c b/libfec/viterbi27_sse.c
new file mode 100644
index 0000000..cd1f287
--- /dev/null
+++ b/libfec/viterbi27_sse.c
@@ -0,0 +1,113 @@
+/* K=7 r=1/2 Viterbi decoder for SSE
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[64]; } metric_t; /* one path metric byte per state */
+typedef union { unsigned long w[2]; unsigned char c[8]; __m64 v[1];} decision_t; /* chainback reads one decision bit per state from c: byte = state/8, bit = state%8 */
+union branchtab27 { unsigned char c[32]; __m64 v[4];} Branchtab27_sse[2]; /* filled by set_viterbi27_polynomial_sse(), one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi27_polynomial_sse() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in ssebfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Allocate and initialise an SSE decoder with room for len+6 decision records; returns NULL if either allocation fails */
+void *create_viterbi27_sse(int len){
+  struct v27 *vp;
+
+  if(!Init){ /* first use: install the default polynomials */
+    int polys[2] = { V27POLYA, V27POLYB };
+
+    set_viterbi27_polynomial_sse(polys);
+  }
+  if((vp = malloc(sizeof(struct v27))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+6)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi27_sse(vp,0); /* use the SSE init directly: the generic init_viterbi27() dispatches on Cpu_mode and could pick a backend with a different struct v27 layout */
+  return vp;
+}
+
+void set_viterbi27_polynomial_sse(int polys[2]){ /* Build both 32-entry branch tables from the generator polynomials */
+  int tap,s;
+
+  for(tap=0;tap < 2;tap++){ /* a negative polynomial inverts the entries of its table */
+    for(s=0;s < 32;s++)
+      Branchtab27_sse[tap].c[s] = (polys[tap] < 0) ^ parity((2*s) & abs(polys[tap])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Reset a decoder for a new frame: all 64 path metrics start at 63 except starting_state (mod 64), which starts at 0. Returns -1 if p is NULL, else 0 */
+int init_viterbi27_sse(void *p,int starting_state){
+  struct v27 *dec = p;
+  int s = 64;
+
+  if(dec == NULL)
+    return -1;
+  while(s-- > 0)
+    dec->metrics1.c[s] = 63;
+
+  dec->dp = dec->decisions;
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate and write nbits decoded bits, 8 per byte, into data. Returns -1 if p is NULL, else 0 */
+int chainback_viterbi27_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+
+  d = vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; /* decision bit of the current 6-bit state: byte = state/8, bit = state%8 */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_sse(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
diff --git a/libfec/viterbi27_sse2.c b/libfec/viterbi27_sse2.c
new file mode 100644
index 0000000..bc01710
--- /dev/null
+++ b/libfec/viterbi27_sse2.c
@@ -0,0 +1,180 @@
+/* K=7 r=1/2 Viterbi decoder for SSE2
+ * Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[64]; __m128i v[4]; } metric_t; /* one path metric byte per state (c), also viewable as 4 SSE2 vectors (v) */
+typedef union { unsigned long w[2]; unsigned char c[8]; unsigned short s[4]; __m64 v[1];} decision_t; /* chainback reads one decision bit per state from c: byte = state/8, bit = state%8 */
+union branchtab27 { unsigned char c[32]; __m128i v[2];} Branchtab27_sse2[2]; /* filled by set_viterbi27_polynomial_sse2(), one table per polynomial */
+static int Init = 0; /* nonzero once set_viterbi27_polynomial_sse2() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly27.s!
+ */
+struct v27 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder for a new frame: all 64 path metrics start at 63 except starting_state (mod 64), which starts at 0. Returns -1 if p is NULL, else 0 */
+int init_viterbi27_sse2(void *p,int starting_state){
+  struct v27 *dec = p;
+  int s = 64;
+
+  if(dec == NULL)
+    return -1;
+  while(s-- > 0)
+    dec->metrics1.c[s] = 63;
+
+  dec->dp = dec->decisions;
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.c[starting_state & 63] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi27_polynomial_sse2(int polys[2]){ /* Build both 32-entry branch tables from the generator polynomials */
+  int tap,s;
+
+  for(tap=0;tap < 2;tap++){ /* a negative polynomial inverts the entries of its table */
+    for(s=0;s < 32;s++)
+      Branchtab27_sse2[tap].c[s] = (polys[tap] < 0) ^ parity((2*s) & abs(polys[tap])) ? 255 : 0;
+  }
+  Init++;
+}
+
+
+/* Allocate and initialise a decoder (struct aligned to sizeof(__m128i)) with room for len+6 decision records; returns NULL if either allocation fails */
+void *create_viterbi27_sse2(int len){
+  void *raw;
+  struct v27 *dec;
+
+  if(Init == 0){ /* first use: install the default polynomials */
+    int defaults[2] = { V27POLYA, V27POLYB };
+    set_viterbi27_polynomial_sse2(defaults);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v27)) != 0)
+    return NULL;
+  dec = (struct v27 *)raw;
+
+  dec->decisions = (decision_t *)malloc((len+6)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi27_sse2(dec,0);
+
+  return dec;
+}
+
+/* Viterbi chainback: trace the stored decisions back from endstate and write nbits decoded bits, 8 per byte, into data. Returns -1 if p is NULL, else 0 */
+int chainback_viterbi27_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = vp->decisions;
+  /* Make room beyond the end of the encoder register so we can
+   * accumulate a full byte of decoded data
+   */
+  endstate %= 64;
+  endstate <<= 2;
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 6; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+
+    k = (d[nbits].c[(endstate>>2)/8] >> ((endstate>>2)%8)) & 1; /* decision bit of the current 6-bit state: byte = state/8, bit = state%8 */
+    data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
+  }
+  return 0;
+}
+
+/* Release a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi27_sse2(void *p){
+  struct v27 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+#if 0
+/* This code is turned off because it's slower than my hand-crafted assembler in sse2bfly27.s. But it does work. NOTE(review): it returns void, whereas the other update_viterbi27_blk_* functions in this patch return int — align the signature before re-enabling */
+void update_viterbi27_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v27 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi8(syms[0]);
+    sym1v = _mm_set1_epi8(syms[1]);
+    syms += 2;
+
+    for(i=0;i<2;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = _mm_avg_epu8(_mm_xor_si128(Branchtab27_sse2[0].v[i],sym0v),_mm_xor_si128(Branchtab27_sse2[1].v[i],sym1v));
+      /* There's no packed bytes right shift in SSE2, so we use the word version and mask
+       * (I'm *really* starting to like Altivec...)
+       */
+      metric = _mm_srli_epi16(metric,3);
+      metric = _mm_and_si128(metric,_mm_set1_epi8(31));
+      m_metric = _mm_sub_epi8(_mm_set1_epi8(31),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_epi8(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_epi8(vp->old_metrics->v[2+i],metric);
+      m1 = _mm_add_epi8(vp->old_metrics->v[2+i],m_metric);
+      m2 = _mm_add_epi8(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select, using modulo arithmetic */
+      decision0 = _mm_cmpgt_epi8(_mm_sub_epi8(m0,m1),_mm_setzero_si128());
+      decision1 = _mm_cmpgt_epi8(_mm_sub_epi8(m2,m3),_mm_setzero_si128());
+      survivor0 = _mm_or_si128(_mm_and_si128(decision0,m1),_mm_andnot_si128(decision0,m0));
+      survivor1 = _mm_or_si128(_mm_and_si128(decision1,m3),_mm_andnot_si128(decision1,m2));
+ 
+      /* Pack each set of decisions into 16 bits */
+      d->s[2*i] = _mm_movemask_epi8(_mm_unpacklo_epi8(decision0,decision1));
+      d->s[2*i+1] = _mm_movemask_epi8(_mm_unpackhi_epi8(decision0,decision1));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi8(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi8(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+}
+#endif
diff --git a/libfec/viterbi29.c b/libfec/viterbi29.c
new file mode 100644
index 0000000..f51e356
--- /dev/null
+++ b/libfec/viterbi29.c
@@ -0,0 +1,178 @@
+/* Switch to K=9 r=1/2 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new K=9 r=1/2 Viterbi decoder instance; len is handed unchanged to the backend chosen by Cpu_mode */
+void *create_viterbi29(int len){
+  find_cpu_mode(); /* defined elsewhere; presumably establishes Cpu_mode before the dispatch below */
+
+  switch(Cpu_mode){
+  case PORT:
+  default: /* any mode not compiled in for this target falls back to portable C */
+    return create_viterbi29_port(len);
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi29_av(len);
+#endif
+#ifdef __i386__
+  case MMX:
+    return create_viterbi29_mmx(len);
+  case SSE:
+    return create_viterbi29_sse(len);
+  case SSE2:
+    return create_viterbi29_sse2(len);
+#endif
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+    return create_viterbi29_port(len);
+#endif
+  }
+}
+
+void set_viterbi29_polynomial(int polys[2]){ /* Forward the two generator polynomials to the backend selected by Cpu_mode */
+  switch(Cpu_mode){ /* NOTE(review): find_cpu_mode() is not called here — confirm Cpu_mode is valid if this runs before create_viterbi29() */
+  case PORT:
+  default:
+    set_viterbi29_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi29_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi29_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi29_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi29_polynomial_sse2(polys);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+    set_viterbi29_polynomial_port(polys);
+    break;
+#endif
+  }
+}
+
+/* Initialize Viterbi decoder for start of new frame; returns whatever the backend selected by Cpu_mode returns */
+int init_viterbi29(void *p,int starting_state){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      return init_viterbi29_port(p,starting_state);
+#ifdef __VEC__
+    case ALTIVEC:
+      return init_viterbi29_av(p,starting_state);
+#endif
+#ifdef __i386__
+    case MMX:
+      return init_viterbi29_mmx(p,starting_state);
+    case SSE:
+      return init_viterbi29_sse(p,starting_state);
+    case SSE2:
+      return init_viterbi29_sse2(p,starting_state);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      return init_viterbi29_port(p,starting_state);
+#endif
+    }
+}
+
+/* Viterbi chainback: forwards all arguments to the backend selected by Cpu_mode and returns its result */
+int chainback_viterbi29(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      return chainback_viterbi29_port(p,data,nbits,endstate);
+#ifdef __VEC__
+    case ALTIVEC:
+      return chainback_viterbi29_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+    case MMX:
+      return chainback_viterbi29_mmx(p,data,nbits,endstate);
+    case SSE:
+      return chainback_viterbi29_sse(p,data,nbits,endstate);
+    case SSE2:
+      return chainback_viterbi29_sse2(p,data,nbits,endstate);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      return chainback_viterbi29_port(p,data,nbits,endstate);
+#endif
+    }
+}
+
+/* Delete instance of a Viterbi decoder by handing p to the delete routine of the backend selected by Cpu_mode */
+void delete_viterbi29(void *p){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      delete_viterbi29_port(p);
+      break;
+#ifdef __VEC__
+    case ALTIVEC:
+      delete_viterbi29_av(p);
+      break;
+#endif
+#ifdef __i386__
+    case MMX:
+      delete_viterbi29_mmx(p);
+      break;
+    case SSE:
+      delete_viterbi29_sse(p);
+      break;
+    case SSE2:
+      delete_viterbi29_sse2(p);
+      break;
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      delete_viterbi29_port(p);
+      break;
+#endif
+    }
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Returns the result of the backend selected by Cpu_mode; p is not checked for NULL here.
+ */
+int update_viterbi29_blk(void *p,unsigned char syms[],int nbits){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* any mode not compiled in for this target falls back to portable C */
+      return update_viterbi29_blk_port(p,syms,nbits);
+#ifdef __VEC__
+    case ALTIVEC:
+      return update_viterbi29_blk_av(p,syms,nbits);
+#endif
+#ifdef __i386__
+    case MMX:
+      return update_viterbi29_blk_mmx(p,syms,nbits);
+    case SSE:
+      return update_viterbi29_blk_sse(p,syms,nbits);
+    case SSE2:
+      return update_viterbi29_blk_sse2(p,syms,nbits);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* x86-64 builds map SSE2 onto the portable C implementation */
+      return update_viterbi29_blk_port(p,syms,nbits);
+#endif
+    }
+}
diff --git a/libfec/viterbi29_av.c b/libfec/viterbi29_av.c
new file mode 100644
index 0000000..31c8d27
--- /dev/null
+++ b/libfec/viterbi29_av.c
@@ -0,0 +1,190 @@
+/* K=9 r=1/2 Viterbi decoder for PowerPC G4/G5 Altivec
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <sys/sysctl.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; vector bool char v[16]; } decision_t; /* Decisions for one decoded bit, as bytes or as vectors */
+typedef union { unsigned char c[256]; vector unsigned char v[16]; } metric_t; /* Path metrics, as bytes or as vectors */
+/* One branch table per polynomial, filled in by set_viterbi29_polynomial_av() */
+static union branchtab29 { unsigned char c[128]; vector unsigned char v[8]; } Branchtab29[2];
+static int Init = 0; /* Nonzero once set_viterbi29_polynomial_av() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision; advanced by update_viterbi29_blk_av() */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated by create_viterbi29_av() */
+};
+
+/* Reset decoder state p so a new frame can be decoded; returns -1 if p is NULL, else 0 */
+int init_viterbi29_av(void *p,int starting_state){
+  struct v29 *dec = p;
+  int n = 16; /* metrics1 holds 16 vectors */
+
+  if(dec == NULL)
+    return -1;
+  while(n-- > 0) /* Every path metric starts at 63 */
+    dec->metrics1.v[n] = (vector unsigned char)(63);
+
+  dec->dp = dec->decisions; /* Rewind the decision pointer to the start of the block */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi29_polynomial_av(int polys[2]){
+  int s,k; /* state index, polynomial index */
+  /* Entry is 255 when (polynomial is negative) XOR parity(masked state) is nonzero, else 0 */
+  for(k=0;k < 2;k++){
+    for(s=0;s < 128;s++)
+      Branchtab29[k].c[s] = ((polys[k] < 0) ^ parity((2*s) & abs(polys[k]))) ? 255 : 0;
+  }
+  Init++; /* Tables are now set; create_viterbi29_av() will skip the default setup */
+}
+
+/* Allocate a decoder with room for len+8 decision entries; returns NULL if memory runs out */
+void *create_viterbi29_av(int len){
+  struct v29 *dec;
+  if(Init == 0){ /* First use: load the default polynomials */
+    int defaults[2] = { V29POLYA,V29POLYB };
+    set_viterbi29_polynomial_av(defaults);
+  }
+  dec = (struct v29 *)malloc(sizeof(struct v29));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec); return NULL; /* Do not leak the struct when the buffer cannot be had */
+  }
+  init_viterbi29_av(dec,0);
+  return dec;
+}
+
+/* Viterbi chainback: trace the stored decisions backwards from endstate, writing the decoded bits into data[] */
+int chainback_viterbi29_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *dec = p;
+  decision_t *hist;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+
+  /* Skip the first 8 decision entries */
+  hist = (decision_t *)dec->decisions + 8;
+  /* Only the low 8 bits of the state are used */
+  endstate &= 255;
+
+  /* Walk from the last data bit back to the first. data[] is stored
+   * on every step rather than every 8th: the final store to each byte
+   * (made when bit is a multiple of 8) is the one that survives
+   */
+  for(bit = nbits;bit-- != 0;){
+    unsigned int k = hist[bit].c[endstate] & 1; /* Decision recorded for this state */
+
+    endstate = (endstate >> 1) | (k << 7); /* The decision becomes the new top state bit */
+    data[bit>>3] = endstate;
+  }
+  return 0;
+}
+
+
+/* Free a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi29_av(void *p){
+  struct v29 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions); /* Inner buffer first, then the struct that points to it */
+  free(dec);
+}
+
+
+int update_viterbi29_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d;
+  int i;
+  /* Run nbits decoder steps, consuming two symbols from syms per step; returns -1 if p is NULL, else 0 */
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+
+  while(nbits--){
+    vector unsigned char sym1v,sym2v;
+    void *tmp;
+    
+    /* All this seems necessary just to load a byte into all elements of a vector! */
+    sym1v = vec_perm(vec_ld(0,syms),vec_ld(1,syms),vec_lvsl(0,syms)); /* sym1v.0 = syms[0]; sym1v.1 = syms[1] */
+    sym2v = vec_splat(sym1v,1); /* Splat syms[1] across sym2v */
+    sym1v = vec_splat(sym1v,0); /* Splat syms[0] across sym1v */
+    syms += 2; /* Two symbols are used per decoded bit */
+    
+    for(i=0;i<8;i++){
+      vector bool char decision0,decision1;
+      vector unsigned char metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics */
+      metric = vec_avg(vec_xor(Branchtab29[0].v[i],sym1v),vec_xor(Branchtab29[1].v[i],sym2v));
+      metric = vec_sr(metric,(vector unsigned char)(3));
+      m_metric = (vector unsigned char)(31) - metric;
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[8+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[8+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select first set */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+
+      /* Interleave and store decisions and survivors */
+      d->v[2*i] = vec_mergeh(decision0,decision1);
+      d->v[2*i+1] = vec_mergel(decision0,decision1);
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+    }
+    d++; /* One decision_t has been filled for this bit */
+    /* Renormalize once metric 0 has reached 50: subtract the smallest metric from all of them */
+    if(vp->new_metrics->c[0] >= 50){
+      int i;
+      vector unsigned char scale0,scale1;
+
+      /* Find smallest metric and splat */
+      scale0 = vp->new_metrics->v[0];
+      scale1 = vp->new_metrics->v[1];
+      for(i=2;i<16;i+=2){
+	scale0 = vec_min(scale0,vp->new_metrics->v[i]);
+	scale1 = vec_min(scale1,vp->new_metrics->v[i+1]);
+      }
+      scale0 = vec_min(scale0,scale1);
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,8));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,4));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,2));
+      scale0 = vec_min(scale0,vec_sld(scale0,scale0,1));
+
+      /* Now subtract from all metrics */
+      for(i=0;i<16;i++)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale0);
+    }
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* Saved so the next call continues after the decisions written here */
+  return 0;
+}
diff --git a/libfec/viterbi29_mmx.c b/libfec/viterbi29_mmx.c
new file mode 100644
index 0000000..563f40a
--- /dev/null
+++ b/libfec/viterbi29_mmx.c
@@ -0,0 +1,118 @@
+/* K=9 r=1/2 Viterbi decoder for MMX
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <mmintrin.h>
+#include "fec.h"
+
+typedef union { char c[256]; __m64 v[32];} decision_t;
+typedef union { unsigned char c[256]; __m64 v[32];} metric_t;
+/* Metric tables indexed [symbol][state], one per polynomial; filled by set_viterbi29_polynomial_mmx() */
+unsigned char Mettab29_1[256][128] __attribute__ ((aligned(8)));
+unsigned char Mettab29_2[256][128] __attribute__ ((aligned(8)));
+static int Init = 0; /* Nonzero once set_viterbi29_polynomial_mmx() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in mmxbfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated by create_viterbi29_mmx() */
+};
+
+/* Create a new instance of a Viterbi decoder with len+8 decision entries; returns NULL on allocation failure */
+void *create_viterbi29_mmx(int len){
+  struct v29 *vp;
+
+  if(Init == 0){
+    int polys[2] = {V29POLYA,V29POLYB}; /* Tables not built yet: use the default polynomials */
+    set_viterbi29_polynomial_mmx(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  /* Call this file's own initializer, not the init_viterbi29() dispatcher: the dispatcher
+   * follows Cpu_mode and would run another variant's initializer on this struct layout */
+  init_viterbi29_mmx(vp,0);
+  return vp;
+}
+
+/* Fill both metric tables for the given polynomial pair; Init is bumped when done */
+void set_viterbi29_polynomial_mmx(int polys[2]){
+  int st,rx; /* encoder state index, received symbol value */
+  for(st=0;st < 128;st++){
+    for(rx = 0;rx < 256;rx++){
+      int flip; /* nonzero: the entry is (255-rx)/16 instead of rx/16 */
+
+      flip = (polys[0] < 0) ^ parity((2*st) & abs(polys[0]));
+      if(flip) Mettab29_1[rx][st] = (255-rx) / 16;
+      else Mettab29_1[rx][st] = rx / 16;
+
+      flip = (polys[1] < 0) ^ parity((2*st) & abs(polys[1]));
+      if(flip) Mettab29_2[rx][st] = (255-rx) / 16;
+      else Mettab29_2[rx][st] = rx / 16;
+    }
+  }
+  Init++;
+}
+
+/* Prepare decoder p for a new frame that starts in starting_state; returns -1 if p is NULL, else 0 */
+int init_viterbi29_mmx(void *p,int starting_state){
+  struct v29 *dec = p;
+  unsigned char *m;
+
+  if(dec == NULL)
+    return -1;
+  /* Every state starts at metric 63 ... */
+  for(m = dec->metrics1.c;m != dec->metrics1.c + 256;m++)
+    *m = 63;
+  dec->metrics1.c[starting_state & 255] = 0; /* ... except the known start state */
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+  return 0;
+}
+
+/* Viterbi chainback: recover nbits decoded bits into data[], working back from endstate */
+int chainback_viterbi29_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *dec = (struct v29 *)p;
+  decision_t *hist;
+  unsigned int state,pos;
+
+  if(dec == NULL)
+    return -1;
+
+  hist = (decision_t *)dec->decisions + 8; /* Look past tail */
+  state = endstate % 256;
+  for(pos = nbits;pos > 0;pos--){
+    unsigned int newest = hist[pos-1].c[state] & 1; /* Stored decision for this state */
+
+    state = (state >> 1) | (newest << 7);
+    /* Stored on every step; the store made at a multiple of 8 is the last one to each byte */
+    data[(pos-1)>>3] = state;
+  }
+  return 0;
+}
+
+/* Dispose of a decoder and its decision buffer; does nothing when p is NULL */
+void delete_viterbi29_mmx(void *p){
+  struct v29 *dec = (struct v29 *)p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
diff --git a/libfec/viterbi29_port.c b/libfec/viterbi29_port.c
new file mode 100644
index 0000000..292dce8
--- /dev/null
+++ b/libfec/viterbi29_port.c
@@ -0,0 +1,166 @@
+/* K=9 r=1/2 Viterbi decoder in portable C
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned int w[256]; } metric_t; /* One path metric per state */
+typedef union { unsigned long w[8];} decision_t; /* One decision bit per state; BFLY and chainback use 32 bits of each element */
+/* One branch table per polynomial, filled in by set_viterbi29_polynomial_port() */
+static union { unsigned char c[128]; } Branchtab29[2];
+static int Init = 0; /* Nonzero once set_viterbi29_polynomial_port() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision; advanced by update_viterbi29_blk_port() */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated by create_viterbi29_port() */
+};
+
+/* Start a new frame: every state gets metric 63 except starting_state, which gets 0 */
+int init_viterbi29_port(void *p,int starting_state){
+  struct v29 *dec;
+  int n;
+
+  if(p == NULL)
+    return -1;
+  dec = p;
+  for(n = 255;n >= 0;n--)
+    dec->metrics1.w[n] = 63;
+  dec->metrics1.w[starting_state & 255] = 0; /* Bias known start state */
+  dec->dp = dec->decisions;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  return 0;
+}
+
+void set_viterbi29_polynomial_port(int polys[2]){
+  int st; /* Table index; the value masked by each polynomial is 2*st */
+  for(st = 127;st >= 0;st--){
+    int odd0 = (polys[0] < 0) ^ parity((2*st) & abs(polys[0]));
+    int odd1 = (polys[1] < 0) ^ parity((2*st) & abs(polys[1]));
+    Branchtab29[0].c[st] = odd0 ? 255 : 0; Branchtab29[1].c[st] = odd1 ? 255 : 0;
+  }
+  Init++; /* Tables are now set; create_viterbi29_port() will skip the default setup */
+}
+
+
+/* Build a decoder whose decision buffer has len+8 entries; NULL means out of memory */
+void *create_viterbi29_port(int len){
+  struct v29 *dec;
+
+  if(Init == 0){ /* No polynomials set yet: use the defaults */
+    int defaults[2] = {V29POLYA,V29POLYB};
+    set_viterbi29_polynomial_port(defaults);
+  }
+  dec = (struct v29 *)malloc(sizeof(struct v29));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi29_port(dec,0);
+  return dec;
+}
+
+
+/* Viterbi chainback: rebuild the decoded bits from the stored decisions, last bit first */
+int chainback_viterbi29_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *dec = p;
+  decision_t *hist;
+  unsigned int state;
+
+  if(dec == NULL)
+    return -1;
+
+  /* Decisions for the data bits start 8 entries into the buffer */
+  hist = dec->decisions + 8;
+  /* Keep only the low 8 bits of the caller's terminal state */
+  state = endstate & 255;
+
+  /* data[] is stored on every step, not every 8th; the last store
+   * to each byte happens when the bit index is a multiple of 8
+   */
+  while(nbits != 0){
+    unsigned long word;
+
+    nbits--;
+    word = hist[nbits].w[state >> 5]; /* 32 decision bits are used per element of w */
+    state = (state >> 1) | ((unsigned int)((word >> (state & 31)) & 1) << 7);
+    data[nbits>>3] = state;
+  }
+  return 0;
+}
+
+
+/* Release the decision buffer and then the decoder itself; NULL is accepted and does nothing */
+void delete_viterbi29_port(void *p){
+  struct v29 *dec = p;
+
+  if(!dec)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: from old metrics i and i+128, writes new metrics 2*i and 2*i+1 and ORs their decision bits into d. Uses vp, d, sym0, sym1 of the caller */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+128] + (510 - metric);\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i)&31);\
+    m0 -= (metric+metric-510);\
+    m1 += (metric+metric-510);\
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Two symbols are read per data bit. Returns -1 if p is NULL, else 0.
+ */
+
+int update_viterbi29_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v29 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    void *tmp;
+    unsigned char sym0,sym1;
+    int i;
+    /* BFLY only ORs decision bits in, so clear this bit's entry first */
+    for(i=0;i<8;i++)
+      d->w[i] = 0;
+    sym0 = *syms++;
+    sym1 = *syms++;
+    
+    for(i=0;i<128;i++)
+      BFLY(i);
+
+    d++;
+    tmp = vp->old_metrics; /* Swap pointers to old and new metrics */
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }  
+  vp->dp = d; /* Saved so the next call continues after the decisions written here */
+  return 0;
+}
diff --git a/libfec/viterbi29_sse.c b/libfec/viterbi29_sse.c
new file mode 100644
index 0000000..4a92e5f
--- /dev/null
+++ b/libfec/viterbi29_sse.c
@@ -0,0 +1,114 @@
+/* K=9 r=1/2 Viterbi decoder for SSE
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char w[256]; __m64 v[32];} metric_t;
+typedef union { unsigned long w[8]; unsigned char c[32]; __m64 v[4];} decision_t; /* chainback_viterbi29_sse() reads one bit per state through c[] */
+/* One branch table per polynomial, filled in by set_viterbi29_polynomial_sse() */
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse[2];
+static int Init = 0; /* Nonzero once set_viterbi29_polynomial_sse() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in [mmx|sse|sse2]bfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated by create_viterbi29_sse() */
+};
+
+/* Create a new instance of a Viterbi decoder with len+8 decision entries; returns NULL on allocation failure */
+void *create_viterbi29_sse(int len){
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = { V29POLYA,V29POLYB };
+    /* Tables not built yet: load the default polynomials */
+    set_viterbi29_polynomial_sse(polys);
+  }
+  if((vp = (struct v29 *)malloc(sizeof(struct v29))) == NULL)
+    return NULL;
+  if((vp->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi29_sse(vp,0); /* Not the init_viterbi29() dispatcher: that follows Cpu_mode, which may name another variant */
+  return vp;
+}
+
+void set_viterbi29_polynomial_sse(int polys[2]){
+  int k,st; /* polynomial index, state index */
+  for(st=0;st < 128;st++)
+    for(k=0;k < 2;k++){
+      int inv = (polys[k] < 0) ^ parity((2*st) & abs(polys[k])); /* nonzero selects 255 */
+      Branchtab29_sse[k].c[st] = inv ? 255 : 0;
+    }
+  Init++;
+}
+
+/* Reset decoder p for a new frame: metric 200 for every state, 0 for starting_state */
+int init_viterbi29_sse(void *p,int starting_state){
+  struct v29 *dec = p;
+  int idx = 0;
+
+  if(!dec)
+    return -1;
+  do {
+    dec->metrics1.w[idx] = 200;
+  } while(++idx < 256);
+  dec->metrics1.w[starting_state & 255] = 0; /* Bias known start state */
+  dec->dp = dec->decisions;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  return 0;
+}
+
+/* Viterbi chainback: read the stored decision bits back from endstate and write the decoded bytes to data[] */
+int chainback_viterbi29_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *dec = p;
+  decision_t *hist;
+  unsigned int state,left;
+
+  if(dec == NULL)
+    return -1;
+  /* Decisions for the data bits begin 8 entries into the buffer */
+  hist = dec->decisions + 8;
+  /* Only the low 8 bits of endstate are used */
+  state = endstate & 255;
+
+  /* Work from the last data bit down to bit 0. data[] is written on
+   * every step; the write made when the bit index is a multiple of 8
+   * is the final one for that byte
+   */
+  for(left = nbits;left != 0;left--){
+    const decision_t *row = &hist[left-1];
+    unsigned int k = (row->c[state >> 3] >> (state & 7)) & 1; /* Decision bit for this state */
+
+    state = (state >> 1) | (k << 7);
+    data[(left-1)>>3] = state;
+  }
+  return 0;
+}
+
+
+/* Free the decision buffer, then the decoder; a NULL p is left alone */
+void delete_viterbi29_sse(void *p){
+  struct v29 *dec = p;
+
+  if(!dec)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
diff --git a/libfec/viterbi29_sse2.c b/libfec/viterbi29_sse2.c
new file mode 100644
index 0000000..4c7336c
--- /dev/null
+++ b/libfec/viterbi29_sse2.c
@@ -0,0 +1,119 @@
+/* K=9 r=1/2 Viterbi decoder for SSE2
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <emmintrin.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; __m128i v[16];} metric_t;
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; /* chainback_viterbi29_sse2() reads one bit per state through c[] */
+/* One branch table per polynomial, filled in by set_viterbi29_polynomial_sse2() */
+union branchtab29 { unsigned char c[128]; } Branchtab29_sse2[2];
+static int Init = 0; /* Nonzero once set_viterbi29_polynomial_sse2() has run */
+
+/* State info for instance of Viterbi decoder
+ * Don't change this without also changing references in sse2bfly29.s!
+ */
+struct v29 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated by create_viterbi29_sse2() */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns -1 if p is NULL, else 0 */
+int init_viterbi29_sse2(void *p,int starting_state){
+  struct v29 *vp = p;
+  int i;
+  if(p == NULL) /* Same guard as the other init_viterbi29_* variants */
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.c[i] = 63;
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->c[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi29_polynomial_sse2(int polys[2]){
+  int st,tbl; /* state index, table (polynomial) index */
+  for(st=0;st < 128;st++)
+    for(tbl=0;tbl < 2;tbl++){
+      if((polys[tbl] < 0) ^ parity((2*st) & abs(polys[tbl]))) Branchtab29_sse2[tbl].c[st] = 255;
+      else Branchtab29_sse2[tbl].c[st] = 0;
+    }
+  Init++;
+}
+
+
+/* Create a new instance of a Viterbi decoder with len+8 decision entries; returns NULL on allocation failure */
+void *create_viterbi29_sse2(int len){
+  void *p;
+  struct v29 *vp;
+
+  if(!Init){
+    int polys[2] = {V29POLYA,V29POLYB};
+    /* Fill this file's own table and Init flag; the set_viterbi29_polynomial() dispatcher follows Cpu_mode */
+    set_viterbi29_polynomial_sse2(polys);
+  }
+  /* Ordinary malloc() only returns 8-byte alignment, we need 16 */
+  if(posix_memalign(&p, sizeof(__m128i),sizeof(struct v29)))
+    return NULL;
+  vp = (struct v29 *)p;
+  if((p = malloc((len+8)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  vp->decisions = (decision_t *)p;
+  init_viterbi29_sse2(vp,0);
+  return vp;
+}
+
+
+/* Viterbi chainback: follow the stored decision bits back from endstate, filling data[] with the decoded bytes */
+int chainback_viterbi29_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v29 *dec = p;
+  const unsigned char *bits;
+  decision_t *hist;
+
+  if(dec == NULL)
+    return -1;
+
+  /* The first 8 decision entries are skipped */
+  hist = dec->decisions;
+  hist += 8;
+
+  /* Reduce the terminal state to its low 8 bits */
+  endstate &= 0xff;
+
+  /* data[] is stored on every pass instead of once per 8 bits;
+   * the store made when nbits is a multiple of 8 is the one that
+   * remains in each byte
+   */
+  while(nbits > 0){
+    bits = hist[--nbits].c; /* Packed decisions, 8 states per byte */
+    endstate = (endstate >> 1) | (((bits[endstate >> 3] >> (endstate & 7)) & 1u) << 7);
+    data[nbits >> 3] = endstate;
+  }
+  return 0;
+}
+
+
+/* Give back the decision buffer and the decoder struct; nothing happens for a NULL p */
+void delete_viterbi29_sse2(void *p){
+  struct v29 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
diff --git a/libfec/viterbi39.c b/libfec/viterbi39.c
new file mode 100644
index 0000000..d2e65f4
--- /dev/null
+++ b/libfec/viterbi39.c
@@ -0,0 +1,179 @@
+/* Switch to K=9 r=1/3 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new instance of a Viterbi decoder using the create routine of the variant named by Cpu_mode */
+void *create_viterbi39(int len){
+  find_cpu_mode(); /* NOTE(review): presumably sets Cpu_mode before the switch reads it - confirm in its definition */
+
+  switch(Cpu_mode){
+  case PORT:
+  default: /* Modes with no case compiled in below also use the portable C routine */
+    return create_viterbi39_port(len);
+#ifdef __VEC__
+  case ALTIVEC:
+    return create_viterbi39_av(len);
+#endif
+#ifdef __i386__
+  case MMX:
+    return create_viterbi39_mmx(len);
+  case SSE:
+    return create_viterbi39_sse(len);
+  case SSE2:
+    return create_viterbi39_sse2(len);
+#endif
+#ifdef __x86_64__
+  case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+    return create_viterbi39_port(len);
+#endif
+  }
+}
+
+void set_viterbi39_polynomial(int polys[3]){ /* Forwards polys to the variant named by Cpu_mode */
+  switch(Cpu_mode){
+  case PORT:
+  default: /* Modes with no case compiled in below also use the portable C routine */
+    set_viterbi39_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi39_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi39_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi39_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi39_polynomial_sse2(polys);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+    set_viterbi39_polynomial_port(polys);
+    break;
+#endif
+  }
+}
+
+
+/* Initialize Viterbi decoder for start of new frame; forwards to the variant named by Cpu_mode and returns its result */
+int init_viterbi39(void *p,int starting_state){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* Modes with no case compiled in below also use the portable C routine */
+      return init_viterbi39_port(p,starting_state);
+#ifdef __VEC__
+    case ALTIVEC:
+      return init_viterbi39_av(p,starting_state);
+#endif
+#ifdef __i386__
+    case MMX:
+      return init_viterbi39_mmx(p,starting_state);
+    case SSE:
+      return init_viterbi39_sse(p,starting_state);
+    case SSE2:
+      return init_viterbi39_sse2(p,starting_state);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+      return init_viterbi39_port(p,starting_state);
+#endif
+    }
+}
+
+/* Viterbi chainback; forwards to the variant named by Cpu_mode and returns its result */
+int chainback_viterbi39(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+    switch(Cpu_mode){
+    case PORT:
+    default: /* Modes with no case compiled in below also use the portable C routine */
+      return chainback_viterbi39_port(p,data,nbits,endstate);
+#ifdef __VEC__
+    case ALTIVEC:
+      return chainback_viterbi39_av(p,data,nbits,endstate);
+#endif
+#ifdef __i386__
+    case MMX:
+      return chainback_viterbi39_mmx(p,data,nbits,endstate);
+    case SSE:
+      return chainback_viterbi39_sse(p,data,nbits,endstate);
+    case SSE2:
+      return chainback_viterbi39_sse2(p,data,nbits,endstate);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+      return chainback_viterbi39_port(p,data,nbits,endstate);
+#endif
+    }
+}
+
+/* Delete instance of a Viterbi decoder by calling the delete routine of the variant named by Cpu_mode */
+void delete_viterbi39(void *p){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* Modes with no case compiled in below also use the portable C routine */
+      delete_viterbi39_port(p);
+      break;
+#ifdef __VEC__
+    case ALTIVEC:
+      delete_viterbi39_av(p);
+      break;
+#endif
+#ifdef __i386__
+    case MMX:
+      delete_viterbi39_mmx(p);
+      break;
+    case SSE:
+      delete_viterbi39_sse(p);
+      break;
+    case SSE2:
+      delete_viterbi39_sse2(p);
+      break;
+#endif
+#ifdef __x86_64__
+    case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+      delete_viterbi39_port(p);
+      break;
+#endif
+    }
+}
+
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! The call is forwarded to the variant named by Cpu_mode.
+ */
+int update_viterbi39_blk(void *p,unsigned char syms[],int nbits){
+    switch(Cpu_mode){
+    case PORT:
+    default: /* Modes with no case compiled in below also use the portable C routine */
+      return update_viterbi39_blk_port(p,syms,nbits);
+#ifdef __VEC__
+    case ALTIVEC:
+      return update_viterbi39_blk_av(p,syms,nbits);
+#endif
+#ifdef __i386__
+    case MMX:
+      return update_viterbi39_blk_mmx(p,syms,nbits);
+    case SSE:
+      return update_viterbi39_blk_sse(p,syms,nbits);
+    case SSE2:
+      return update_viterbi39_blk_sse2(p,syms,nbits);
+#endif
+#ifdef __x86_64__
+    case SSE2: /* On x86-64 the SSE2 mode is served by the portable C routine */
+      return update_viterbi39_blk_port(p,syms,nbits);
+#endif
+    }
+}
diff --git a/libfec/viterbi39_av.c b/libfec/viterbi39_av.c
new file mode 100644
index 0000000..2deed51
--- /dev/null
+++ b/libfec/viterbi39_av.c
@@ -0,0 +1,251 @@
+/* K=9 r=1/3 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned char c[2][16]; vector unsigned char v[2]; } decision_t; /* Decisions for one decoded bit, packed one bit per state */
+typedef union { unsigned short s[256]; vector unsigned short v[32]; } metric_t; /* One 16-bit path metric per state */
+/* One branch table per polynomial, filled in by set_viterbi39_polynomial_av() */
+static union branchtab39 { unsigned short s[128]; vector unsigned short v[16];} Branchtab39[3];
+static int Init = 0; /* Nonzero once set_viterbi39_polynomial_av() has run */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision; advanced by update_viterbi39_blk_av() */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block; allocated by create_viterbi39_av() */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns -1 if p is NULL, else 0 */
+int init_viterbi39_av(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+  if(p == NULL) /* Same guard as init_viterbi39_mmx() */
+    return -1;
+  for(i=0;i<32;i++)
+    vp->metrics1.v[i] = (vector unsigned short)(1000);
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 255] = 0; /* Bias known start state */
+  return 0;
+}
+
+void set_viterbi39_polynomial_av(int polys[3]){
+  int k,st; /* polynomial index, state index */
+  for(k=0;k < 3;k++){
+    int negate = polys[k] < 0; /* A negative polynomial flips every entry of its table */
+    int taps = abs(polys[k]);
+    for(st=0;st < 128;st++)
+      Branchtab39[k].s[st] = (negate ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Create a new instance of a Viterbi decoder with len+8 decision entries; returns NULL if either allocation fails */
+void *create_viterbi39_av(int len){
+  struct v39 *vp;
+  if(!Init){ /* Tables not built yet: load the default polynomials */
+    int polys[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_av(polys);
+  }
+  if((vp = (struct v39 *)malloc(sizeof(struct v39))) == NULL || (vp->decisions = malloc(sizeof(decision_t)*(len+8))) == NULL){
+    free(vp); /* vp is NULL if the first malloc failed, and free(NULL) does nothing */
+    return NULL;
+  }
+  init_viterbi39_av(vp,0);
+  return vp;
+}
+
+/* Viterbi chainback; returns the old_metrics entry for endstate, or -1 if p is NULL */
+int chainback_viterbi39_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  if(p == NULL) /* Same guard as the chainback_viterbi29_* variants */
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 256; /* Only the low 8 bits name a state */
+
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 8; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric;
+}
+
+/* Free a decoder and its decision buffer; a NULL p is ignored */
+void delete_viterbi39_av(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+int update_viterbi39_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d = (decision_t *)vp->dp;
+  int path_metric = 0;
+  vector unsigned char decisions = (vector unsigned char)(0);
+  /* Run nbits decoder steps, three symbols from syms per step; returns the sum of the scale values subtracted while renormalizing */
+  while(nbits--){
+    vector unsigned short symv,sym0v,sym1v,sym2v;
+    vector unsigned char s;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+
+    symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s);    /* Unsigned byte->word unpack */ 
+    sym0v = vec_splat(symv,0);
+    sym1v = vec_splat(symv,1);
+    sym2v = vec_splat(symv,2);
+    syms += 3; /* Three symbols are used per decoded bit */
+    
+    for(i=0;i<16;i++){
+      vector bool short decision0,decision1;
+      vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * the metrics are in the range 0-765
+       */
+      m0 = vec_add(vec_xor(Branchtab39[0].v[i],sym0v),vec_xor(Branchtab39[1].v[i],sym1v));
+      m1 = vec_xor(Branchtab39[2].v[i],sym2v);
+      metric = vec_add(m0,m1);
+      m_metric = vec_sub((vector unsigned short)(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[16+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[16+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+    
+      /* Store decisions and survivors.
+       * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+       * a funny interleaved fashion that we undo in the chainback function.
+       */
+      decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+
+      /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+       * 0xff is equivalent to adding 1, which sets the lsb.
+       */
+      decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+
+      if((i % 8) == 7){
+	/* We've accumulated a total of 128 decisions, stash and start again */
+	d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+      }
+    }
+#if 0
+    /* Experimentally determine metric spread
+     * The results are fixed for a given code and input symbol size
+     */
+    {
+      int i;
+      vector unsigned short min_metric;
+      vector unsigned short max_metric;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      int minimum,maximum;
+      static int max_spread = 0;
+
+      min_metric = max_metric = vp->new_metrics->v[0];
+      for(i=1;i<32;i++){
+	min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+	max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+      }
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+
+      t.v = min_metric;
+      minimum = t.s[0];
+      t.v = max_metric;
+      maximum = t.s[0];
+      if(maximum-minimum > max_spread){
+	max_spread = maximum-minimum;
+	printf("metric spread = %d\n",max_spread);
+      }
+    }
+#endif
+
+    /* Renormalize if necessary. This deserves some explanation.
+     * The maximum possible spread, found by experiment, for 8 bit symbols is about 3825
+     * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+     * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+     * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+
+     * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+     * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+
+     * All this is borne out by experiment.
+     */
+    if(vp->new_metrics->s[0] >= USHRT_MAX-5000){
+      vector unsigned short scale;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      
+      /* Find smallest metric and splat */
+      scale = vp->new_metrics->v[0];
+      for(i=1;i<32;i++)
+	scale = vec_min(scale,vp->new_metrics->v[i]);
+
+      scale = vec_min(scale,vec_sld(scale,scale,8));
+      scale = vec_min(scale,vec_sld(scale,scale,4));
+      scale = vec_min(scale,vec_sld(scale,scale,2));
+
+      /* Subtract it from all metrics
+       * Work backwards to try to improve the cache hit ratio, assuming LRU
+       */
+      for(i=31;i>=0;i--)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+      t.v = scale;
+      path_metric += t.s[0]; /* Keep a running total of what has been subtracted */
+    }
+    d++; /* One decision_t has been filled for this bit */
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* Saved so the next call continues after the decisions written here */
+  return path_metric;
+}
diff --git a/libfec/viterbi39_mmx.c b/libfec/viterbi39_mmx.c
new file mode 100644
index 0000000..875391a
--- /dev/null
+++ b/libfec/viterbi39_mmx.c
@@ -0,0 +1,185 @@
+/* K=9 r=1/3 Viterbi decoder for x86 MMX
+ * Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned char c[256]; __m64 v[32];} decision_t; /* decisions for one decoded bit: one byte per state */
+typedef union { unsigned short s[256]; __m64 v[64];} metric_t; /* 256 16-bit path metrics, also addressable as 64 MMX vectors */
+
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; /* one table per polynomial; entries are 0 or 255 */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_mmx() has filled Branchtab39 */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Reset a decoder instance for a new frame; returns 0, or -1 if p is NULL */
+int init_viterbi39_mmx(void *p,int starting_state){
+  struct v39 *dec = p;
+  int state = 0;
+
+  if(dec == NULL)
+    return -1;
+  /* Every state starts with the same penalty ... */
+  while(state < 256)
+    dec->metrics1.s[state++] = 1000;
+  dec->metrics1.s[starting_state & 255] = 0; /* ... except the known start state */
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions; /* rewind to the first decision slot */
+  return 0;
+}
+
+/* Load three generator polynomials into the branch tables read by the MMX butterflies */
+void set_viterbi39_polynomial_mmx(int polys[3]){
+  int state,n;
+
+  /* A negative polynomial requests an inverted symbol; its taps are the magnitude, hence abs() */
+  for(state=0;state < 128;state++)
+    for(n=0;n < 3;n++)
+      Branchtab39[n].s[state] = (polys[n] < 0) ^ parity((2*state) & abs(polys[n])) ? 255:0;
+  Init++; /* tells create_viterbi39_mmx() not to install the defaults */
+}
+
+/* Allocate and initialize a decoder with len+8 decision slots; returns NULL if out of memory */
+void *create_viterbi39_mmx(int len){
+  struct v39 *dec;
+  if(Init == 0){ /* first use: install the default polynomials */
+    int defaults[3] = { V39POLYA,V39POLYB,V39POLYC };
+    set_viterbi39_polynomial_mmx(defaults);
+  }
+  if((dec = (struct v39 *)malloc(sizeof *dec)) == NULL)
+    return NULL;
+  dec->decisions = malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_mmx(dec,0);
+  return dec;
+}
+
+
+
+/* Trace the surviving path backwards from endstate and write the decoded bits,
+ * packed into bytes, to data[]. Returns the path metric stored for endstate,
+ * or -1 if p is NULL.
+ */
+int chainback_viterbi39_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *dec = p;
+  decision_t *history;
+  unsigned int bit;
+  int final_metric;
+
+  if(dec == NULL)
+    return -1;
+
+  endstate &= 255;
+  final_metric = dec->old_metrics->s[endstate];
+  history = (decision_t *)dec->decisions + 8; /* Look past tail */
+
+  /* data[] is rewritten on every step rather than once per byte; the last
+   * store to each byte (at bit index 8n) leaves the completed octet behind,
+   * so no conditional branch is needed.
+   */
+  for(bit = nbits; bit-- != 0; ){
+    unsigned int survivor = history[bit].c[endstate] & 1;
+
+    endstate = (survivor << 7) | (endstate >> 1);
+    data[bit>>3] = endstate;
+  }
+  return final_metric;
+}
+
+/* Release a decoder and its decision history; a NULL p is ignored */
+void delete_viterbi39_mmx(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps, reading three symbols from syms per step; returns -1 if p is NULL */
+int update_viterbi39_blk_mmx(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0; /* grows by 65536 each time a 16-bit metric wraparound is suspected; this is the return value */
+
+  if(p == NULL)
+    return -1;
+
+  d = (decision_t *)vp->dp;
+  
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    syms += 3;
+
+    for(i=0;i<32;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (765-metric) are in the range 0-765: three symbols of at most 255 each
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_pi16(vp->old_metrics->v[32+i],metric);
+      m1 = _mm_add_pi16(vp->old_metrics->v[32+i],m_metric);
+      m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select
+       * There's no packed min instruction in MMX, so we use modulo arithmetic
+       * to form the decisions and then do the select the hard way
+       */
+      decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+      decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+      survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+      survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+ 
+      /* Merge decisions and store as bytes */
+      d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    if(vp->new_metrics->s[0] < vp->old_metrics->s[0])
+      path_metric += 65536; /* Hack: wraparound probably occurred */
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next call continues writing decisions */
+  _mm_empty(); /* leave MMX state so later floating-point code works */
+  return path_metric;
+}
diff --git a/libfec/viterbi39_port.c b/libfec/viterbi39_port.c
new file mode 100644
index 0000000..5685c90
--- /dev/null
+++ b/libfec/viterbi39_port.c
@@ -0,0 +1,168 @@
+/* K=9 r=1/3 Viterbi decoder in portable C
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned int w[256]; } metric_t; /* one path metric per state */
+typedef union { unsigned long w[8];} decision_t; /* decision bits for one decoded bit; the code indexes 32 bits per word */
+
+static union { unsigned char c[128]; } Branchtab39[3]; /* one table per polynomial; entries are 0 or 255 */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_port() has filled Branchtab39 */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  decision_t *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block */
+};
+
+/* Start a new frame: reset all path metrics and rewind the decision pointer; -1 if p is NULL */
+int init_viterbi39_port(void *p,int starting_state){
+  struct v39 *dec = p;
+  unsigned int *metric;
+
+  if(dec == NULL)
+    return -1;
+  for(metric = dec->metrics1.w; metric < dec->metrics1.w + 256; metric++)
+    *metric = 63;
+
+  dec->metrics1.w[starting_state & 255] = 0; /* Bias known start state */
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+  return 0;
+}
+
+/* Fill the branch tables from three generator polynomials (a negative value inverts that symbol) */
+void set_viterbi39_polynomial_port(int polys[3]){
+  int state,n;
+
+  for(state=0;state < 128;state++){
+    for(n=0;n < 3;n++)
+      Branchtab39[n].c[state] = (polys[n] < 0) ^ parity((2*state) & abs(polys[n])) ? 255 : 0;
+  }
+  Init++;
+}
+
+/* Allocate and initialize a portable-C decoder with len+8 decision slots; NULL on failure */
+void *create_viterbi39_port(int len){
+  struct v39 *dec;
+
+  if(Init == 0){ /* tables not built yet: fall back to the default polynomials */
+    int defaults[3] = {V39POLYA,V39POLYB,V39POLYC};
+    set_viterbi39_polynomial_port(defaults);
+  }
+  dec = (struct v39 *)malloc(sizeof(struct v39));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = (decision_t *)malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_port(dec,0);
+  return dec;
+}
+
+
+/* Walk the stored decisions backwards from endstate, emitting decoded bytes into data[].
+ * Returns 0 (this variant reports no path metric), or -1 if p is NULL.
+ */
+int chainback_viterbi39_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *dec = p;
+  decision_t *history;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+
+  endstate &= 255; /* keep only the 8 state bits */
+  history = dec->decisions + 8; /* Look past tail */
+
+  /* Each step shifts the decided bit into the top of the 8-bit state and
+   * stores the state into the output byte for that bit. Storing on every
+   * step, not just every eighth, avoids a conditional branch.
+   */
+  for(bit = nbits; bit-- != 0; ){
+    unsigned long word = history[bit].w[endstate/32];
+    int k = (word >> (endstate%32)) & 1;
+
+    endstate = (endstate >> 1) | (k << 7);
+    data[bit>>3] = endstate;
+  }
+  return 0;
+}
+
+
+/* Free a decoder created by create_viterbi39_port(); does nothing for NULL */
+void delete_viterbi39_port(void *p){
+  struct v39 *dec = p;
+
+  if(!dec)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: add-compare-select for new states 2*i and 2*i+1, fed by old states i and i+128. Uses vp, d, sym0, sym1, sym2 from the caller's scope. */
+#define BFLY(i) {\
+unsigned int metric,m0,m1,decision;\
+    metric = (Branchtab39[0].c[i] ^ sym0) + (Branchtab39[1].c[i] ^ sym1) + \
+     (Branchtab39[2].c[i] ^ sym2); /* branch metric, 0..765 */ \
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+128] + (765 - metric);\
+    decision = (signed int)(m0-m1) > 0; /* 1 when the path through state i+128 is cheaper */ \
+    vp->new_metrics->w[2*i] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i)&31);\
+    m0 -= (metric+metric-765); /* m0 is now old[i] + (765 - metric) */ \
+    m1 += (metric+metric-765); /* m1 is now old[i+128] + metric */ \
+    decision = (signed int)(m0-m1) > 0;\
+    vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
+    d->w[i/16] |= decision << ((2*i+1)&31);\
+}
+
+/* Update decoder with a block of demodulated symbols.
+ * nbits counts decoded data bits, not symbols: every bit consumes three
+ * bytes of syms[]. Returns 0, or -1 if p is NULL.
+ */
+
+int update_viterbi39_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p; /* vp, d and sym0..sym2 keep these names because BFLY() refers to them */
+  decision_t *d;
+
+  if(vp == NULL)
+    return -1;
+
+  for(d = (decision_t *)vp->dp; nbits != 0; nbits--, d++){
+    metric_t *swap;
+    unsigned char sym0,sym1,sym2;
+    int n;
+
+    /* BFLY() ORs decision bits in, so the slot must start out clear */
+    for(n=0;n<8;n++)
+      d->w[n] = 0;
+    sym0 = syms[0];
+    sym1 = syms[1];
+    sym2 = syms[2];
+    syms += 3;
+
+    for(n=0;n<128;n++)
+      BFLY(n);
+    /* The metrics just written become the inputs of the next step */
+    swap = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = swap;
+  }
+  vp->dp = d;
+  return 0;
+}
diff --git a/libfec/viterbi39_sse.c b/libfec/viterbi39_sse.c
new file mode 100644
index 0000000..c2f2865
--- /dev/null
+++ b/libfec/viterbi39_sse.c
@@ -0,0 +1,201 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE
+ * Copyright Aug 2006, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[8]; unsigned char c[32];} decision_t; /* decisions for one decoded bit; the code uses c[]: one bit per state, 8 states per byte */
+typedef union { signed short s[256]; __m64 v[64];} metric_t; /* 256 signed 16-bit path metrics, also addressable as 64 __m64 vectors */
+
+static union branchtab39 { unsigned short s[128]; __m64 v[32];} Branchtab39[3]; /* one table per polynomial; entries are 0 or 255 */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_sse() has filled Branchtab39 */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Start a new frame; metrics are stored biased by SHRT_MIN. Returns 0, or -1 if p is NULL */
+int init_viterbi39_sse(void *p,int starting_state){
+  struct v39 *dec = p;
+  int n;
+
+  if(dec == NULL)
+    return -1;
+  dec->old_metrics = &dec->metrics1;
+  dec->new_metrics = &dec->metrics2;
+  dec->dp = dec->decisions;
+
+  /* The known start state gets the lowest metric; all others start 1000 above it */
+  for(n=255;n>=0;n--)
+    dec->old_metrics->s[n] = (n == (starting_state & 255)) ? SHRT_MIN : (SHRT_MIN+1000);
+  return 0;
+}
+
+/* Allocate and initialize an SSE decoder with len+8 decision slots; NULL if out of memory */
+void *create_viterbi39_sse(int len){
+  struct v39 *dec;
+  if(Init == 0){
+    /* No polynomials installed yet: use the defaults */
+    int defaults[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_sse(defaults);
+  }
+  dec = (struct v39 *)malloc(sizeof(struct v39));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+8)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi39_sse(dec,0);
+  return dec;
+}
+
+/* Load three generator polynomials into the branch tables read by the SSE butterflies */
+void set_viterbi39_polynomial_sse(int polys[3]){
+  int state,n;
+
+  /* A negative polynomial requests an inverted symbol; its taps are the magnitude, hence abs() */
+  for(state=0;state < 128;state++)
+    for(n=0;n < 3;n++)
+      Branchtab39[n].s[state] = (polys[n] < 0) ^ parity((2*state) & abs(polys[n])) ? 255:0;
+  Init++; /* tells create_viterbi39_sse() not to install the defaults */
+}
+
+/* Viterbi chainback: follow the decisions back from endstate, storing decoded
+ * bytes in data[]. Returns the metric of endstate with the SHRT_MIN bias
+ * removed, or -1 if p is NULL.
+ */
+int chainback_viterbi39_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *dec = p;
+  decision_t *history;
+  unsigned int bit;
+  int biased_metric;
+
+  if(dec == NULL)
+    return -1;
+
+  endstate &= 255;
+  biased_metric = dec->old_metrics->s[endstate];
+  history = (decision_t *)dec->decisions + 8; /* Look past tail */
+
+  /* Decisions are packed one bit per state, eight states per byte. data[] is
+   * stored on every step, not only every eighth, to avoid a branch.
+   */
+  for(bit = nbits; bit-- != 0; ){
+    int k = (history[bit].c[endstate/8] >> (endstate%8)) & 1;
+
+    endstate = (k << 7) | (endstate >> 1);
+    data[bit>>3] = endstate;
+  }
+  return biased_metric - SHRT_MIN;
+}
+
+/* Dispose of a decoder and its decision buffer; NULL is accepted and ignored */
+void delete_viterbi39_sse(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps, reading three symbols from syms per step; returns -1 if p is NULL */
+int update_viterbi39_blk_sse(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0; /* sum of all renormalization adjustments made below; this is the return value */
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    syms += 3;
+
+    for(i=0;i<32;i++){
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-765
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab39[0].v[i],sym0v),_mm_xor_si64(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_pi16(_mm_xor_si64(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(765),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_pi16(vp->old_metrics->v[32+i],metric);
+      m1 = _mm_adds_pi16(vp->old_metrics->v[32+i],m_metric);
+      m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      survivor0 = _mm_min_pi16(m0,m1);
+      survivor1 = _mm_min_pi16(m2,m3);
+      decision0 = _mm_cmpeq_pi16(survivor0,m1);
+      decision1 = _mm_cmpeq_pi16(survivor1,m3);
+ 
+      /* Pack decisions into 8 bits and store */
+      d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
+      int i,adjust;
+      __m64 adjustv;
+      union { __m64 v; signed short w[4]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<64;i++)
+	adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32));
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16));    
+      t.v = adjustv; /* lane 0 now holds the minimum over all 256 metrics */
+      adjust = t.w[0] - SHRT_MIN;
+      path_metric += adjust;
+      adjustv = _mm_set1_pi16(adjust);
+      
+      for(i=0;i<64;i++)
+	vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next call continues writing decisions */
+  _mm_empty(); /* leave MMX state so later floating-point code works */
+  return path_metric;
+}
diff --git a/libfec/viterbi39_sse2.c b/libfec/viterbi39_sse2.c
new file mode 100644
index 0000000..f13794e
--- /dev/null
+++ b/libfec/viterbi39_sse2.c
@@ -0,0 +1,200 @@
+/* K=9 r=1/3 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[8]; unsigned short s[16];} decision_t; /* decisions for one decoded bit; update_viterbi39_blk_sse2() stores them through s[], 16 bits at a time */
+typedef union { signed short s[256]; __m128i v[32];} metric_t; /* 256 signed 16-bit path metrics, also addressable as 32 __m128i vectors */
+
+static union branchtab39 { unsigned short s[128]; __m128i v[16];} Branchtab39[3]; /* one table per polynomial; entries are 0 or 255 */
+static int Init = 0; /* nonzero once set_viterbi39_polynomial_sse2() has filled Branchtab39 */
+
+/* State info for instance of Viterbi decoder */
+struct v39 {
+  metric_t metrics1; /* path metric buffer 1 */
+  metric_t metrics2; /* path metric buffer 2 */
+  void *dp;          /* Pointer to current decision */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block */
+};
+
+/* Initialize Viterbi decoder for start of new frame; returns 0, or -1 if p is NULL */
+int init_viterbi39_sse2(void *p,int starting_state){
+  struct v39 *vp = p;
+  int i;
+  if(p == NULL) /* reject a missing instance instead of dereferencing it */
+    return -1;
+  for(i=0;i<256;i++)
+    vp->metrics1.s[i] = (SHRT_MIN+1000); /* metrics are kept biased by SHRT_MIN */
+  vp->old_metrics = &vp->metrics1;
+  vp->new_metrics = &vp->metrics2;
+  vp->dp = vp->decisions;
+  vp->old_metrics->s[starting_state & 255] = SHRT_MIN; /* Bias known start state */
+  return 0;
+}
+
+/* Allocate and initialize an SSE2 decoder with len+8 decision slots; NULL if out of memory */
+void *create_viterbi39_sse2(int len){
+  void *raw;
+  struct v39 *dec;
+  void *history;
+
+  if(Init == 0){ /* first use: install the default polynomials */
+    int defaults[3] = { V39POLYA, V39POLYB, V39POLYC };
+    set_viterbi39_polynomial_sse2(defaults);
+  }
+  /* The struct is allocated with __m128i (16-byte) alignment, which malloc() may not provide */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v39)) != 0)
+    return NULL;
+  dec = (struct v39 *)raw;
+  history = malloc((len+8)*sizeof(decision_t));
+  if(history == NULL){
+    free(dec);
+    return NULL;
+  }
+  dec->decisions = (decision_t *)history;
+  init_viterbi39_sse2(dec,0);
+  return dec;
+}
+
+/* Load three generator polynomials into the branch tables read by the SSE2 butterflies */
+void set_viterbi39_polynomial_sse2(int polys[3]){
+  int state,n;
+
+  /* A negative polynomial requests an inverted symbol; its taps are the magnitude, hence abs() */
+  for(state=0;state < 128;state++)
+    for(n=0;n < 3;n++)
+      Branchtab39[n].s[state] = (polys[n] < 0) ^ parity((2*state) & abs(polys[n])) ? 255:0;
+  Init++; /* tells create_viterbi39_sse2() not to install the defaults */
+}
+
+/* Viterbi chainback; returns the stored (SHRT_MIN-biased) metric of endstate, or -1 if p is NULL */
+int chainback_viterbi39_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric;
+
+  if(p == NULL)
+    return -1;
+  endstate %= 256;
+  path_metric = vp->old_metrics->s[endstate];
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d = (decision_t *)vp->decisions + 8; /* Look past tail */
+  while(nbits-- != 0){
+    /* Read the 16-bit view that update_viterbi39_blk_sse2() writes, so the lookup does not depend on sizeof(unsigned long) */
+    int k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1;
+    endstate = (k << 7) | (endstate >> 1);
+    data[nbits>>3] = endstate;
+  }
+  return path_metric;
+}
+
+/* Free the decision buffer and then the decoder itself; a NULL p is a no-op */
+void delete_viterbi39_sse2(void *p){
+  struct v39 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+
+/* Run nbits add-compare-select steps (three symbols each); returns the summed renormalization adjustments, or -1 if p is NULL */
+int update_viterbi39_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v39 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    syms += 3;
+
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<16;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-765
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab39[0].v[i],sym0v),_mm_xor_si128(Branchtab39[1].v[i],sym1v));
+      metric = _mm_add_epi16(_mm_xor_si128(Branchtab39[2].v[i],sym2v),m0);
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(765),metric);
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[16+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[16+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-5000){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<32;i++)
+	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      path_metric += adjust;
+      adjustv = _mm_set1_epi16(adjust);
+
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<32;i++)
+	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
+
+
diff --git a/libfec/viterbi615.c b/libfec/viterbi615.c
new file mode 100644
index 0000000..ec2fb3c
--- /dev/null
+++ b/libfec/viterbi615.c
@@ -0,0 +1,181 @@
+/* K=15 r=1/6 Viterbi decoder with optional Intel or PowerPC SIMD
+ * Copyright Feb 2004, Phil Karn, KA9Q
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+/* Create a new instance of a Viterbi decoder.
+ * find_cpu_mode() is called first, then the request is handed to the
+ * implementation matching Cpu_mode; len is passed through unchanged and the
+ * implementation's return value is returned as-is.
+ */
+void *create_viterbi615(int len){
+  find_cpu_mode();
+
+#ifdef __VEC__
+  if(Cpu_mode == ALTIVEC)
+    return create_viterbi615_av(len);
+#endif
+
+#ifdef __i386__
+  if(Cpu_mode == MMX)
+    return create_viterbi615_mmx(len);
+  if(Cpu_mode == SSE)
+    return create_viterbi615_sse(len);
+  if(Cpu_mode == SSE2)
+    return create_viterbi615_sse2(len);
+#endif
+
+  /* Portable C serves PORT, any mode with no branch compiled in above,
+   * and SSE2 on x86-64.
+   */
+  return create_viterbi615_port(len);
+}
+/* Install six generator polynomials in the implementation selected by Cpu_mode */
+void set_viterbi615_polynomial(int polys[6]){
+  find_cpu_mode(); /* may be the first libfec call, so probe before dispatching, as create_viterbi615() does */
+  switch(Cpu_mode){
+  case PORT:
+  default:
+    set_viterbi615_polynomial_port(polys);
+    break;
+#ifdef __VEC__
+  case ALTIVEC:
+    set_viterbi615_polynomial_av(polys);
+    break;
+#endif
+#ifdef __i386__
+  case MMX:
+    set_viterbi615_polynomial_mmx(polys);
+    break;
+  case SSE:
+    set_viterbi615_polynomial_sse(polys);
+    break;
+  case SSE2:
+    set_viterbi615_polynomial_sse2(polys);
+    break;
+#endif
+#ifdef __x86_64__
+  case SSE2:
+    set_viterbi615_polynomial_port(polys);
+    break;
+#endif
+  }
+}
+
+/* Initialize Viterbi decoder for start of new frame.
+ * Forwards p and starting_state to the implementation selected by Cpu_mode
+ * and returns whatever that implementation returns.
+ */
+int init_viterbi615(void *p,int starting_state){
+#ifdef __VEC__
+    if(Cpu_mode == ALTIVEC)
+      return init_viterbi615_av(p,starting_state);
+#endif
+
+#ifdef __i386__
+    if(Cpu_mode == MMX)
+      return init_viterbi615_mmx(p,starting_state);
+    if(Cpu_mode == SSE)
+      return init_viterbi615_sse(p,starting_state);
+    if(Cpu_mode == SSE2)
+      return init_viterbi615_sse2(p,starting_state);
+#endif
+
+    /* Portable C serves PORT, any mode with no branch compiled in above,
+     * and SSE2 on x86-64.
+     */
+    return init_viterbi615_port(p,starting_state);
+}
+
+/* Viterbi chainback.
+ * Forwards all arguments to the implementation selected by Cpu_mode and
+ * returns that implementation's result.
+ */
+int chainback_viterbi615(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+
+#ifdef __VEC__
+    if(Cpu_mode == ALTIVEC)
+      return chainback_viterbi615_av(p,data,nbits,endstate);
+#endif
+
+#ifdef __i386__
+    if(Cpu_mode == MMX)
+      return chainback_viterbi615_mmx(p,data,nbits,endstate);
+    if(Cpu_mode == SSE)
+      return chainback_viterbi615_sse(p,data,nbits,endstate);
+    if(Cpu_mode == SSE2)
+      return chainback_viterbi615_sse2(p,data,nbits,endstate);
+#endif
+
+    /* Portable C serves PORT, any mode with no branch compiled in above,
+     * and SSE2 on x86-64.
+     */
+    return chainback_viterbi615_port(p,data,nbits,endstate);
+}
+
+/* Delete instance of a Viterbi decoder by handing p to the delete routine
+ * of the implementation selected by Cpu_mode. */
+void delete_viterbi615(void *p){
+#ifdef __VEC__
+    if(Cpu_mode == ALTIVEC){
+      delete_viterbi615_av(p);
+      return;
+    }
+#endif
+
+#ifdef __i386__
+    if(Cpu_mode == MMX){
+      delete_viterbi615_mmx(p);
+      return;
+    }
+    if(Cpu_mode == SSE){
+      delete_viterbi615_sse(p);
+      return;
+    }
+    if(Cpu_mode == SSE2){
+      delete_viterbi615_sse2(p);
+      return;
+    }
+#endif
+
+    /* Portable C serves PORT, any mode with no branch compiled in above,
+     * and SSE2 on x86-64.
+     */
+    delete_viterbi615_port(p);
+}
+
+/* Update decoder with a block of demodulated symbols.
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! The call is forwarded to the implementation selected by
+ * Cpu_mode and its result is returned unchanged.
+ */
+int update_viterbi615_blk(void *p,unsigned char syms[],int nbits){
+    /* PowerPC AltiVec build */
+#ifdef __VEC__
+    if(Cpu_mode == ALTIVEC)
+      return update_viterbi615_blk_av(p,syms,nbits);
+#endif
+
+    /* 32-bit x86 build */
+#ifdef __i386__
+    if(Cpu_mode == MMX)
+      return update_viterbi615_blk_mmx(p,syms,nbits);
+    if(Cpu_mode == SSE)
+      return update_viterbi615_blk_sse(p,syms,nbits);
+    if(Cpu_mode == SSE2)
+      return update_viterbi615_blk_sse2(p,syms,nbits);
+#endif
+
+    /* Portable C serves PORT, any mode with no branch compiled in above,
+     * and SSE2 on x86-64.
+     */
+    return update_viterbi615_blk_port(p,syms,nbits);
+}
+
diff --git a/libfec/viterbi615_av.c b/libfec/viterbi615_av.c
new file mode 100644
index 0000000..4a6ce9c
--- /dev/null
+++ b/libfec/viterbi615_av.c
@@ -0,0 +1,257 @@
+/* K=15 r=1/6 Viterbi decoder for PowerPC G4/G5 Altivec vector instructions
+ * 8-bit offset-binary soft decision samples
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned char c[128][16]; vector unsigned char v[128]; } decision_t;
+typedef union { unsigned short s[16384]; vector unsigned short v[2048]; } metric_t;
+
+static union branchtab615 { unsigned short s[8192]; vector unsigned short v[1024];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder (layout specific to this AltiVec implementation) */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Pointer to current decision; advanced once per decoded bit */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block; allocated in create, freed in delete */
+};
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 when p is NULL */
+int init_viterbi615_av(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n = 0;
+
+  if(dec == NULL)
+    return -1;
+
+  /* Every state starts with the same path metric */
+  while(n < 2048)
+    dec->metrics1.v[n++] = (vector unsigned short)(5000);
+
+  dec->dp = dec->decisions; /* rewind to the start of the decision buffer */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.s[starting_state & 16383] = 0; /* the known start state gets the lowest metric */
+  return 0;
+}
+
+/* Create a new instance of a Viterbi decoder; returns NULL if memory cannot be allocated */
+void *create_viterbi615_av(int len){
+  struct v615 *vp = (struct v615 *)malloc(sizeof(struct v615));
+  if(!Init){ /* First use: build the branch tables from the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_av(polys);
+  }
+  if(vp != NULL && (vp->decisions = malloc(sizeof(decision_t)*(len+14))) == NULL){
+    free(vp); /* Decision buffer allocation failed: release the instance too */
+    return NULL;
+  }
+  return (init_viterbi615_av(vp,0) == 0) ? vp : NULL; /* init fails only when vp is NULL */
+}
+
+/* Rebuild the branch tables for the six polynomials in polys */
+void set_viterbi615_polynomial_av(int polys[6]){
+  int k,st;
+  for(k=0;k<6;k++){
+    int inverted = polys[k] < 0, taps = abs(polys[k]); /* a negative polynomial flips the table entries */
+    for(st=0;st<8192;st++)
+      Branchtab615[k].s[st] = (inverted ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_av() skips the default setup once this is non-zero */
+}
+
+
+/* Viterbi chainback: returns the path metric of endstate, or -1 if p is NULL */
+int chainback_viterbi615_av(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+  int path_metric;
+  if(p == NULL) /* Reject a missing decoder instance, as the other implementations do */
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  path_metric = vp->old_metrics->s[endstate]; /* Metric of the terminal state, handed back to the caller */
+
+  /* The store into data[] only needs to be done every 8 bits.
+   * But this avoids a conditional branch, and the writes will
+   * combine in the cache anyway
+   */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    k = (d[nbits].c[endstate >> 7][endstate & 15] & (0x80 >> ((endstate>>4)&7)) ) ? 1 : 0;
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return path_metric;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_av(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps (6 symbols read from syms per bit); returns the summed renormalization offsets, or -1 if p is NULL */
+int update_viterbi615_blk_av(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+  int path_metric = 0;
+  vector unsigned char decisions = (vector unsigned char)(0);
+
+  if(p == NULL) /* Reject a missing decoder instance, as the other implementations do */
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    vector unsigned short symv,sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    vector unsigned char s;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    s = (vector unsigned char)vec_perm(vec_ld(0,syms),vec_ld(5,syms),vec_lvsl(0,syms));
+
+    symv = (vector unsigned short)vec_mergeh((vector unsigned char)(0),s);    /* Unsigned byte->word unpack */ 
+    sym0v = vec_splat(symv,0);
+    sym1v = vec_splat(symv,1);
+    sym2v = vec_splat(symv,2);
+    sym3v = vec_splat(symv,3);
+    sym4v = vec_splat(symv,4);
+    sym5v = vec_splat(symv,5);
+    syms += 6;
+    
+    for(i=0;i<1024;i++){
+      vector bool short decision0,decision1;
+      vector unsigned short metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = vec_add(vec_xor(Branchtab615[0].v[i],sym0v),vec_xor(Branchtab615[1].v[i],sym1v));
+      m1 = vec_add(vec_xor(Branchtab615[2].v[i],sym2v),vec_xor(Branchtab615[3].v[i],sym3v));
+      m2 = vec_add(vec_xor(Branchtab615[4].v[i],sym4v),vec_xor(Branchtab615[5].v[i],sym5v));
+      metric = vec_add(m0,m1);
+      metric = vec_add(metric,m2);
+      m_metric = vec_sub((vector unsigned short)(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = vec_adds(vp->old_metrics->v[i],metric);
+      m3 = vec_adds(vp->old_metrics->v[1024+i],metric);
+      m1 = vec_adds(vp->old_metrics->v[1024+i],m_metric);
+      m2 = vec_adds(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      decision0 = vec_cmpgt(m0,m1);
+      decision1 = vec_cmpgt(m2,m3);
+      survivor0 = vec_min(m0,m1);
+      survivor1 = vec_min(m2,m3);
+    
+      /* Store decisions and survivors.
+       * To save space without SSE2's handy PMOVMSKB instruction, we pack and store them in
+       * a funny interleaved fashion that we undo in the chainback function.
+       */
+      decisions = vec_add(decisions,decisions); /* Shift each byte 1 bit to the left */
+
+      /* Booleans are either 0xff or 0x00. Subtracting 0x00 leaves the lsb zero; subtracting
+       * 0xff is equivalent to adding 1, which sets the lsb.
+       */
+      decisions = vec_sub(decisions,(vector unsigned char)vec_pack(vec_mergeh(decision0,decision1),vec_mergel(decision0,decision1)));
+
+      vp->new_metrics->v[2*i] = vec_mergeh(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = vec_mergel(survivor0,survivor1);
+
+      if((i % 8) == 7){
+	/* We've accumulated a total of 128 decisions, stash and start again */
+	d->v[i>>3] = decisions; /* No need to clear, the new bits will replace the old */
+      }
+    }
+#if 0
+    /* Experimentally determine metric spread
+     * The results are fixed for a given code and input symbol size
+     */
+    {
+      int i;
+      vector unsigned short min_metric;
+      vector unsigned short max_metric;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      int minimum,maximum;
+      static int max_spread = 0;
+
+      min_metric = max_metric = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++){
+	min_metric = vec_min(min_metric,vp->new_metrics->v[i]);
+	max_metric = vec_max(max_metric,vp->new_metrics->v[i]);
+      }
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,8));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,8));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,4));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,4));
+      min_metric = vec_min(min_metric,vec_sld(min_metric,min_metric,2));
+      max_metric = vec_max(max_metric,vec_sld(max_metric,max_metric,2));
+
+      t.v = min_metric;
+      minimum = t.s[0];
+      t.v = max_metric;
+      maximum = t.s[0];
+      if(maximum-minimum > max_spread){
+	max_spread = maximum-minimum;
+	printf("metric spread = %d\n",max_spread);
+      }
+    }
+#endif
+
+    /* Renormalize if necessary. This deserves some explanation.
+     * The maximum possible spread, found by experiment, for 4-bit symbols is 405; for 8 bit symbols, it's 12750.
+     * So by looking at one arbitrary metric we can tell if any of them have possibly saturated.
+     * However, this is very conservative. Large spreads occur only at very high Eb/No, where
+     * saturating a bad path metric doesn't do much to increase its chances of being erroneously chosen as a survivor.
+     * At more interesting (low) Eb/No ratios, the spreads are much smaller so our chances of saturating a metric
+     * by not normalizing when we should are extremely low. So either way, the risk to performance is small.
+     * All this is borne out by experiment.
+     */
+    if(vp->new_metrics->s[0] >= USHRT_MAX-12750){
+      vector unsigned short scale;
+      union { vector unsigned short v; unsigned short s[8];} t;
+      
+      /* Find smallest metric and splat */
+      scale = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+	scale = vec_min(scale,vp->new_metrics->v[i]);
+
+      scale = vec_min(scale,vec_sld(scale,scale,8));
+      scale = vec_min(scale,vec_sld(scale,scale,4));
+      scale = vec_min(scale,vec_sld(scale,scale,2));
+
+      /* Subtract it from all metrics
+       * Work backwards to try to improve the cache hit ratio, assuming LRU
+       */
+      for(i=2047;i>=0;i--)
+	vp->new_metrics->v[i] = vec_subs(vp->new_metrics->v[i],scale);
+      t.v = scale;
+      path_metric += t.s[0];
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d;
+  return path_metric;
+}
diff --git a/libfec/viterbi615_mmx.c b/libfec/viterbi615_mmx.c
new file mode 100644
index 0000000..89a56f7
--- /dev/null
+++ b/libfec/viterbi615_mmx.c
@@ -0,0 +1,183 @@
+/* K=15 r=1/6 Viterbi decoder for x86 MMX
+ * Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <mmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "fec.h"
+
+typedef union { unsigned char c[16384]; __m64 v[2048];} decision_t;
+typedef union { unsigned short s[16384]; __m64 v[4096];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder (layout specific to this MMX implementation) */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Pointer to current decision; advanced once per decoded bit */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block; allocated in create, freed in delete */
+};
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 when p is NULL */
+int init_viterbi615_mmx(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n = 0;
+
+  if(dec == NULL)
+    return -1;
+  while(n < 16384) /* every state starts with the same path metric */
+    dec->metrics1.s[n++] = 5000;
+
+  dec->dp = dec->decisions; /* rewind to the start of the decision buffer */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.s[starting_state & 16383] = 0; /* the known start state gets the lowest metric */
+  return 0;
+}
+
+/* Allocate and initialize a decoder instance; returns NULL when memory cannot be allocated */
+void *create_viterbi615_mmx(int len){
+  struct v615 *dec;
+  if(Init == 0){ /* branch tables not built yet: use the default polynomials */
+    int defaults[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_mmx(defaults);
+  }
+  dec = (struct v615 *)malloc(sizeof(struct v615));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+14)*sizeof(decision_t));
+  if(dec->decisions != NULL){
+    init_viterbi615_mmx(dec,0);
+    return dec;
+  }
+  free(dec); /* no decision buffer: give the instance back as well */
+  return NULL;
+}
+
+/* Rebuild the branch tables for the six polynomials in polys */
+void set_viterbi615_polynomial_mmx(int polys[6]){
+  int k,st;
+  for(k=0;k<6;k++){
+    int inverted = polys[k] < 0, taps = abs(polys[k]); /* a negative polynomial flips the table entries */
+    for(st=0;st<8192;st++)
+      Branchtab615[k].s[st] = (inverted ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_mmx() skips the default setup once this is non-zero */
+}
+
+/* Trace the stored decisions back from endstate, writing nbits decoded bits into data.
+ * Returns 0, or -1 when p is NULL.
+ */
+int chainback_viterbi615_mmx(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *hist;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+
+  /* Look past the 14 tail entries at the front of the decision buffer */
+  hist = (decision_t *)dec->decisions + 14;
+  endstate &= 16383;
+
+  /* data[] is stored on every bit although once per 8 bits would do;
+   * this avoids a conditional branch
+   */
+  for(bit = nbits; bit-- != 0; ){
+    int k = hist[bit].c[endstate] & 1; /* one decision byte per state */
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_mmx(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps, reading 6 symbols from syms per bit; returns 0, or -1 when p is NULL */
+int update_viterbi615_blk_mmx(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+
+  d = (decision_t *)vp->dp;
+  
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+
+    for(i=0;i<2048;i++){ /* each __m64 holds four 16-bit metrics, so 2048 passes cover 8192 butterflies */
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_add_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_add_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_add_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_add_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select
+       * There's no packed min instruction in MMX, so we use modulo arithmetic
+       * to form the decisions and then do the select the hard way
+       */
+      decision0 = _mm_cmpgt_pi16(_mm_sub_pi16(m0,m1),_mm_setzero_si64());
+      decision1 = _mm_cmpgt_pi16(_mm_sub_pi16(m2,m3),_mm_setzero_si64());
+      survivor0 = _mm_or_si64(_mm_and_si64(decision0,m1),_mm_andnot_si64(decision0,m0));
+      survivor1 = _mm_or_si64(_mm_and_si64(decision1,m3),_mm_andnot_si64(decision1,m2));
+ 
+      /* Merge decisions and store as bytes */
+      d->v[i] = _mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64()));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+  _mm_empty(); /* clear the MMX state before returning to ordinary code */
+  return 0;
+}
diff --git a/libfec/viterbi615_port.c b/libfec/viterbi615_port.c
new file mode 100644
index 0000000..89bdd80
--- /dev/null
+++ b/libfec/viterbi615_port.c
@@ -0,0 +1,156 @@
+/* K=15 r=1/6 Viterbi decoder in portable C
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t;
+typedef union { unsigned long w[16384]; } metric_t;
+
+static union branchtab615 { unsigned long w[8192]; } Branchtab615[6] __attribute__ ((aligned(16)));
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder (layout specific to this portable C implementation) */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  decision_t *dp;          /* Pointer to current decision; advanced once per decoded bit */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  decision_t *decisions;   /* Beginning of decisions for block; allocated in create, freed in delete */
+};
+
+/* Create a new instance of a Viterbi decoder; returns NULL if memory cannot be allocated */
+void *create_viterbi615_port(int len){
+  struct v615 *vp;
+
+  if(!Init){ /* First use: build the branch tables from the default polynomials */
+    int polys[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_port(polys);
+  }
+  if((vp = (struct v615 *)malloc(sizeof(struct v615))) == NULL)
+    return NULL;
+  if((vp->decisions = malloc((len+14)*sizeof(decision_t))) == NULL){
+    free(vp);
+    return NULL;
+  }
+  init_viterbi615_port(vp,0); /* Not the Cpu_mode dispatcher: only the port init matches this struct layout */
+  return vp;
+}
+
+/* Rebuild the branch tables for the six polynomials in polys */
+void set_viterbi615_polynomial_port(int polys[6]){
+  int k,st;
+  for(k=0;k<6;k++){
+    int inverted = polys[k] < 0, taps = abs(polys[k]); /* a negative polynomial flips the table entries */
+    for(st=0;st<8192;st++)
+      Branchtab615[k].w[st] = (inverted ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_port() skips the default setup once this is non-zero */
+}
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 when p is NULL */
+int init_viterbi615_port(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n = 0;
+
+  if(dec == NULL)
+    return -1;
+  while(n < 16384) /* every state starts with the same path metric */
+    dec->metrics1.w[n++] = 1000;
+
+  dec->dp = dec->decisions; /* rewind to the start of the decision buffer */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.w[starting_state & 16383] = 0; /* the known start state gets the lowest metric */
+  return 0;
+}
+
+/* Trace the stored decisions back from endstate, writing nbits decoded bits into data.
+ * Returns 0, or -1 when p is NULL.
+ */
+int chainback_viterbi615_port(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *hist;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+  hist = (decision_t *)dec->decisions + 14; /* Look past tail */
+  endstate &= 16383;
+
+  /* data[] is stored on every bit although once per 8 bits would do;
+   * this avoids a conditional branch
+   */
+  for(bit = nbits; bit-- != 0; ){
+    int k = (hist[bit].c[endstate >> 3] >> (endstate & 7)) & 1; /* one decision bit per state */
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_port(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* C-language butterfly: reads old metrics i and i+8192, writes new metrics 2i and 2i+1, ORs 2 decision bits into *d. Uses vp, syms and d from the caller's scope */
+#define BFLY(i) {\
+unsigned long metric,m0,m1,m2,m3,decision0,decision1;\
+    metric = ((Branchtab615[0].w[i] ^ syms[0]) + (Branchtab615[1].w[i] ^ syms[1])\
+	      +(Branchtab615[2].w[i] ^ syms[2]) + (Branchtab615[3].w[i] ^ syms[3])\
+	      +(Branchtab615[4].w[i] ^ syms[4]) + (Branchtab615[5].w[i] ^ syms[5]));\
+    m0 = vp->old_metrics->w[i] + metric;\
+    m1 = vp->old_metrics->w[i+8192] + (1530 - metric);\
+    m2 = vp->old_metrics->w[i] + (1530-metric);\
+    m3 = vp->old_metrics->w[i+8192] + metric;\
+    decision0 = (signed long)(m0-m1) >= 0;\
+    decision1 = (signed long)(m2-m3) >= 0;\
+    vp->new_metrics->w[2*i] = decision0 ? m1 : m0;\
+    vp->new_metrics->w[2*i+1] = decision1 ? m3 : m2;\
+    d->c[i/4] |= ((decision0|(decision1<<1)) << ((2*i)&7));\
+}
+/* Update decoder with a block of demodulated symbols
+ * Note that nbits is the number of decoded data bits, not the number
+ * of symbols! Six symbols are read per bit. Returns 0, or -1 when p is NULL.
+ */
+
+int update_viterbi615_blk_port(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  void *tmp;
+  decision_t *d;
+  int i;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    memset(d,0,sizeof(decision_t)); /* BFLY ORs its decision bits in, so start from zero */
+    for(i=0;i<8192;i++)
+      BFLY(i); /* the macro picks up vp, syms and d by name */
+
+    syms += 6;
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }    
+  vp->dp = d; /* remember where the next block's decisions go */
+  return 0;
+}
+
diff --git a/libfec/viterbi615_sse.c b/libfec/viterbi615_sse.c
new file mode 100644
index 0000000..de0f8af
--- /dev/null
+++ b/libfec/viterbi615_sse.c
@@ -0,0 +1,201 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <xmmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned char c[2048];} decision_t;
+typedef union { signed short s[16384]; __m64 v[4096];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m64 v[2048];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder (layout specific to this SSE implementation) */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Pointer to current decision; advanced once per decoded bit */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block; allocated in create, freed in delete */
+};
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 when p is NULL */
+int init_viterbi615_sse(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n = 0;
+
+  if(dec == NULL)
+    return -1;
+  while(n < 16384) /* every state starts with the same path metric */
+    dec->metrics1.s[n++] = (SHRT_MIN+5000);
+
+  dec->dp = dec->decisions; /* rewind to the start of the decision buffer */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.s[starting_state & 16383] = SHRT_MIN; /* the known start state gets the lowest metric */
+  return 0;
+}
+
+/* Allocate and initialize a decoder instance; returns NULL when memory cannot be allocated */
+void *create_viterbi615_sse(int len){
+  struct v615 *dec;
+
+  if(Init == 0){ /* branch tables not built yet: use the default polynomials */
+    int defaults[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse(defaults);
+  }
+  dec = (struct v615 *)malloc(sizeof(struct v615));
+  if(dec == NULL)
+    return NULL;
+  dec->decisions = malloc((len+14)*sizeof(decision_t));
+  if(dec->decisions != NULL){
+    init_viterbi615_sse(dec,0);
+    return dec;
+  }
+  free(dec); /* no decision buffer: give the instance back as well */
+  return NULL;
+}
+
+/* Rebuild the branch tables for the six polynomials in polys */
+void set_viterbi615_polynomial_sse(int polys[6]){
+  int k,st;
+  for(k=0;k<6;k++){
+    int inverted = polys[k] < 0, taps = abs(polys[k]); /* a negative polynomial flips the table entries */
+    for(st=0;st<8192;st++)
+      Branchtab615[k].s[st] = (inverted ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_sse() skips the default setup once this is non-zero */
+}
+
+/* Trace the stored decisions back from endstate, writing nbits decoded bits into data.
+ * Returns 0, or -1 when p is NULL.
+ */
+int chainback_viterbi615_sse(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *dec = p;
+  decision_t *hist;
+  unsigned int bit;
+
+  if(dec == NULL)
+    return -1;
+  hist = (decision_t *)dec->decisions + 14; /* Look past tail */
+  endstate &= 16383;
+
+  /* data[] is stored on every bit although once per 8 bits would do;
+   * this avoids a conditional branch
+   */
+  for(bit = nbits; bit-- != 0; ){
+    /* Byte-wise lookup through c[]: byte endstate/8, bit endstate%8 */
+    int k = (hist[bit].c[endstate >> 3] >> (endstate & 7)) & 1;
+
+    endstate = (k << 13) | (endstate >> 1);
+    data[bit>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_sse(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps, reading 6 symbols from syms per bit; returns 0, or -1 when p is NULL */
+int update_viterbi615_blk_sse(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL)
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m64 sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_pi16(syms[0]);
+    sym1v = _mm_set1_pi16(syms[1]);
+    sym2v = _mm_set1_pi16(syms[2]);
+    sym3v = _mm_set1_pi16(syms[3]);
+    sym4v = _mm_set1_pi16(syms[4]);
+    sym5v = _mm_set1_pi16(syms[5]);
+    syms += 6;
+
+    for(i=0;i<2048;i++){ /* each __m64 holds four 16-bit metrics, so 2048 passes cover 8192 butterflies */
+      __m64 decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_pi16(_mm_xor_si64(Branchtab615[0].v[i],sym0v),_mm_xor_si64(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_pi16(_mm_xor_si64(Branchtab615[2].v[i],sym2v),_mm_xor_si64(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_pi16(_mm_xor_si64(Branchtab615[4].v[i],sym4v),_mm_xor_si64(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_pi16(m0,_mm_add_pi16(m1,m2));
+      m_metric = _mm_sub_pi16(_mm_set1_pi16(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_pi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_pi16(vp->old_metrics->v[2048+i],metric);
+      m1 = _mm_adds_pi16(vp->old_metrics->v[2048+i],m_metric);
+      m2 = _mm_adds_pi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      survivor0 = _mm_min_pi16(m0,m1);
+      survivor1 = _mm_min_pi16(m2,m3);
+      decision0 = _mm_cmpeq_pi16(survivor0,m1);
+      decision1 = _mm_cmpeq_pi16(survivor1,m3);
+ 
+      /* Pack decisions into 8 bits and store */
+      d->c[i] = _mm_movemask_pi8(_mm_unpacklo_pi8(_mm_packs_pi16(decision0,_mm_setzero_si64()),_mm_packs_pi16(decision1,_mm_setzero_si64())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_pi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_pi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m64 adjustv;
+      union { __m64 v; signed short w[4]; } t;
+
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<4096;i++)
+	adjustv = _mm_min_pi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,32));
+      adjustv = _mm_min_pi16(adjustv,_mm_srli_si64(adjustv,16));    
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_pi16(adjust);
+      
+      for(i=0;i<4096;i++)
+	vp->new_metrics->v[i] = _mm_sub_pi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* remember where the next block's decisions go */
+  _mm_empty(); /* clear the MMX state before returning to ordinary code */
+  return 0;
+}
diff --git a/libfec/viterbi615_sse2.c b/libfec/viterbi615_sse2.c
new file mode 100644
index 0000000..7f711e5
--- /dev/null
+++ b/libfec/viterbi615_sse2.c
@@ -0,0 +1,204 @@
+/* K=15 r=1/6 Viterbi decoder for x86 SSE2
+ * Copyright Mar 2004, Phil Karn, KA9Q
+ * May be used under the terms of the GNU Lesser General Public License (LGPL)
+ */
+#include <emmintrin.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include "fec.h"
+
+typedef union { unsigned long w[512]; unsigned short s[1024];} decision_t;
+typedef union { signed short s[16384]; __m128i v[2048];} metric_t;
+
+static union branchtab615 { unsigned short s[8192]; __m128i v[1024];} Branchtab615[6];
+static int Init = 0;
+
+/* State info for instance of Viterbi decoder (layout specific to this SSE2 implementation) */
+struct v615 {
+  metric_t metrics1; /* path metric buffer 1 (old_metrics points here after init) */
+  metric_t metrics2; /* path metric buffer 2 (new_metrics points here after init) */
+  void *dp;          /* Pointer to current decision; advanced once per decoded bit */
+  metric_t *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
+  void *decisions;   /* Beginning of decisions for block; allocated in create, freed in delete */
+};
+
+/* Reset a decoder instance for the start of a new frame; returns 0, or -1 when p is NULL */
+int init_viterbi615_sse2(void *p,int starting_state){
+  struct v615 *dec = p;
+  int n = 0;
+
+  if(dec == NULL)
+    return -1;
+  while(n < 16384) /* every state starts with the same path metric */
+    dec->metrics1.s[n++] = (SHRT_MIN+5000);
+
+  dec->dp = dec->decisions; /* rewind to the start of the decision buffer */
+  dec->new_metrics = &dec->metrics2;
+  dec->old_metrics = &dec->metrics1;
+  dec->metrics1.s[starting_state & 16383] = SHRT_MIN; /* the known start state gets the lowest metric */
+  return 0;
+}
+
+/* Allocate and initialize a decoder instance; returns NULL when memory cannot be allocated */
+void *create_viterbi615_sse2(int len){
+  void *raw;
+  struct v615 *dec;
+
+  if(Init == 0){ /* branch tables not built yet: use the default polynomials */
+    int defaults[6] = { V615POLYA,V615POLYB,V615POLYC,V615POLYD,V615POLYE,V615POLYF };
+    set_viterbi615_polynomial_sse2(defaults);
+  }
+
+  /* The instance is allocated aligned to sizeof(__m128i); plain malloc() may only give 8-byte alignment */
+  if(posix_memalign(&raw, sizeof(__m128i),sizeof(struct v615)) != 0)
+    return NULL;
+  dec = (struct v615 *)raw;
+
+  dec->decisions = malloc((len+14)*sizeof(decision_t));
+  if(dec->decisions == NULL){
+    free(dec);
+    return NULL;
+  }
+  init_viterbi615_sse2(dec,0);
+  return dec;
+}
+
+/* Rebuild the branch tables for the six polynomials in polys */
+void set_viterbi615_polynomial_sse2(int polys[6]){
+  int k,st;
+  for(k=0;k<6;k++){
+    int inverted = polys[k] < 0, taps = abs(polys[k]); /* a negative polynomial flips the table entries */
+    for(st=0;st<8192;st++)
+      Branchtab615[k].s[st] = (inverted ^ parity((2*st) & taps)) ? 255 : 0;
+  }
+  Init++; /* create_viterbi615_sse2() skips the default setup once this is non-zero */
+}
+
+/* Viterbi chainback: writes nbits decoded bits into data; returns 0, or -1 if p is NULL */
+int chainback_viterbi615_sse2(
+      void *p,
+      unsigned char *data, /* Decoded output data */
+      unsigned int nbits, /* Number of data bits */
+      unsigned int endstate){ /* Terminal encoder state */
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL) /* Reject a missing decoder instance, as the other implementations do */
+    return -1;
+  d = (decision_t *)vp->decisions;
+  endstate %= 16384;
+  /* The store into data[] only needs to be done every 8 bits, but doing it on
+   * every bit avoids a conditional branch */
+  d += 14; /* Look past tail */
+  while(nbits-- != 0){
+    int k;
+    /* Read through s[], the view update_viterbi615_blk_sse2() stores with; no dependence on sizeof(long) */
+    k = (d[nbits].s[endstate/16] >> (endstate%16)) & 1;
+    endstate = (k << 13) | (endstate >> 1);
+    data[nbits>>3] = endstate >> 6;
+  }
+  return 0;
+}
+
+/* Release a decoder instance together with its decision buffer; a NULL p is ignored */
+void delete_viterbi615_sse2(void *p){
+  struct v615 *dec = p;
+
+  if(dec == NULL)
+    return;
+  free(dec->decisions);
+  free(dec);
+}
+
+/* Run nbits add-compare-select steps, reading 6 symbols from syms per bit; returns 0, or -1 if p is NULL */
+int update_viterbi615_blk_sse2(void *p,unsigned char *syms,int nbits){
+  struct v615 *vp = p;
+  decision_t *d;
+
+  if(p == NULL) /* Reject a missing decoder instance, as the other implementations do */
+    return -1;
+  d = (decision_t *)vp->dp;
+  while(nbits--){
+    __m128i sym0v,sym1v,sym2v,sym3v,sym4v,sym5v;
+    void *tmp;
+    int i;
+    /* Splat the 0th symbol across sym0v, the 1st symbol across sym1v, etc */
+    sym0v = _mm_set1_epi16(syms[0]);
+    sym1v = _mm_set1_epi16(syms[1]);
+    sym2v = _mm_set1_epi16(syms[2]);
+    sym3v = _mm_set1_epi16(syms[3]);
+    sym4v = _mm_set1_epi16(syms[4]);
+    sym5v = _mm_set1_epi16(syms[5]);
+    syms += 6;
+
+    /* SSE2 doesn't support saturated adds on unsigned shorts, so we have to use signed shorts */
+    for(i=0;i<1024;i++){
+      __m128i decision0,decision1,metric,m_metric,m0,m1,m2,m3,survivor0,survivor1;
+
+      /* Form branch metrics
+       * Because Branchtab takes on values 0 and 255, and the values of sym?v are offset binary in the range 0-255,
+       * the XOR operations constitute conditional negation.
+       * metric and m_metric (-metric) are in the range 0-1530
+       */
+      m0 = _mm_add_epi16(_mm_xor_si128(Branchtab615[0].v[i],sym0v),_mm_xor_si128(Branchtab615[1].v[i],sym1v));
+      m1 = _mm_add_epi16(_mm_xor_si128(Branchtab615[2].v[i],sym2v),_mm_xor_si128(Branchtab615[3].v[i],sym3v));
+      m2 = _mm_add_epi16(_mm_xor_si128(Branchtab615[4].v[i],sym4v),_mm_xor_si128(Branchtab615[5].v[i],sym5v));
+      metric = _mm_add_epi16(m0,_mm_add_epi16(m1,m2));
+      m_metric = _mm_sub_epi16(_mm_set1_epi16(1530),metric);
+    
+      /* Add branch metrics to path metrics */
+      m0 = _mm_adds_epi16(vp->old_metrics->v[i],metric);
+      m3 = _mm_adds_epi16(vp->old_metrics->v[1024+i],metric);
+      m1 = _mm_adds_epi16(vp->old_metrics->v[1024+i],m_metric);
+      m2 = _mm_adds_epi16(vp->old_metrics->v[i],m_metric);
+    
+      /* Compare and select */
+      survivor0 = _mm_min_epi16(m0,m1);
+      survivor1 = _mm_min_epi16(m2,m3);
+      decision0 = _mm_cmpeq_epi16(survivor0,m1);
+      decision1 = _mm_cmpeq_epi16(survivor1,m3);
+ 
+      /* Pack each set of decisions into 8 8-bit bytes, then interleave them and compress into 16 bits */
+      d->s[i] = _mm_movemask_epi8(_mm_unpacklo_epi8(_mm_packs_epi16(decision0,_mm_setzero_si128()),_mm_packs_epi16(decision1,_mm_setzero_si128())));
+
+      /* Store surviving metrics */
+      vp->new_metrics->v[2*i] = _mm_unpacklo_epi16(survivor0,survivor1);
+      vp->new_metrics->v[2*i+1] = _mm_unpackhi_epi16(survivor0,survivor1);
+    }
+    /* See if we need to renormalize
+     * Max metric spread for this code with 0-255 branch metrics is 12750
+     */
+    if(vp->new_metrics->s[0] >= SHRT_MAX-12750){
+      int i,adjust;
+      __m128i adjustv;
+      union { __m128i v; signed short w[8]; } t;
+      
+      /* Find smallest metric and set adjustv to bring it down to SHRT_MIN */
+      adjustv = vp->new_metrics->v[0];
+      for(i=1;i<2048;i++)
+	adjustv = _mm_min_epi16(adjustv,vp->new_metrics->v[i]);
+
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,8));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,4));
+      adjustv = _mm_min_epi16(adjustv,_mm_srli_si128(adjustv,2));
+      t.v = adjustv;
+      adjust = t.w[0] - SHRT_MIN;
+      adjustv = _mm_set1_epi16(adjust);
+
+      /* We cannot use a saturated subtract, because we often have to adjust by more than SHRT_MAX
+       * This is okay since it can't overflow anyway
+       */
+      for(i=0;i<2048;i++)
+	vp->new_metrics->v[i] = _mm_sub_epi16(vp->new_metrics->v[i],adjustv);
+    }
+    d++;
+    /* Swap pointers to old and new metrics */
+    tmp = vp->old_metrics;
+    vp->old_metrics = vp->new_metrics;
+    vp->new_metrics = tmp;
+  }
+  vp->dp = d; /* Remember where the next block's decisions go */
+  return 0;
+}
diff --git a/libfec/vtest27.c b/libfec/vtest27.c
new file mode 100644
index 0000000..7256483
--- /dev/null
+++ b/libfec/vtest27.c
@@ -0,0 +1,184 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the single-letter flags handled in main() */
+  { "frame-length",  required_argument, NULL, 'l' },
+  { "frame-count",   required_argument, NULL, 'n' },
+  { "ebn0",          required_argument, NULL, 'e' },
+  { "gain",          required_argument, NULL, 'g' },
+  { "verbose",       no_argument,       NULL, 'v' },
+  { "force-altivec", no_argument,       NULL, 'a' },
+  { "force-port",    no_argument,       NULL, 'p' },
+  { "force-mmx",     no_argument,       NULL, 'm' },
+  { "force-sse",     no_argument,       NULL, 's' },
+  { "force-sse2",    no_argument,       NULL, 't' },
+  { NULL, 0, NULL, 0 } /* terminator */
+};
+#endif
+/* RATE feeds the Eb/N0 to Es/N0 conversion; MAXBYTES caps the frame size */
+#define RATE (1.0/2.0)
+#define MAXBYTES 10000
+/* Defaults, adjustable with -g and -v */
+double Gain = 32.0;
+int Verbose = 0;
+
+/* Test driver for the viterbi27 decoder.
+ * With -e <Eb/N0 in dB>: encodes random frames, passes them through addnoise(), decodes and prints BER/FER.
+ * Without -e: times repeated decodes of a constant symbol buffer.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the endless << 1 is well defined */
+  int trials = 10000,errcnt,framebits=2048;
+  long long int tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: at the 8*MAXBYTES limit the 6 tail bits are stored in byte MAXBYTES */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+6)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel meaning "-e not given": run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi27(framebits)) == NULL){
+    printf("create_viterbi27 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data followed by 6 zero tail bits */
+      for(i=0;i<framebits+6;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff; /* last write to each byte (i%8 == 7) leaves the full data byte */
+	symbols[2*i+0] = addnoise(parity(sr & V27POLYA),gain,Gain,127.5,255);
+	symbols[2*i+1] = addnoise(parity(sr & V27POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer; first initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits+6);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials); /* tr == trials here; tr+1 would overcount frames */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi27(vp,0);
+
+      /* Decode block */
+      update_viterbi27_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi27(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}
diff --git a/libfec/vtest29.c b/libfec/vtest29.c
new file mode 100644
index 0000000..8471b54
--- /dev/null
+++ b/libfec/vtest29.c
@@ -0,0 +1,185 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the single-letter flags handled in main() */
+  { "frame-length",  required_argument, NULL, 'l' },
+  { "frame-count",   required_argument, NULL, 'n' },
+  { "ebn0",          required_argument, NULL, 'e' },
+  { "gain",          required_argument, NULL, 'g' },
+  { "verbose",       no_argument,       NULL, 'v' },
+  { "force-altivec", no_argument,       NULL, 'a' },
+  { "force-port",    no_argument,       NULL, 'p' },
+  { "force-mmx",     no_argument,       NULL, 'm' },
+  { "force-sse",     no_argument,       NULL, 's' },
+  { "force-sse2",    no_argument,       NULL, 't' },
+  { NULL, 0, NULL, 0 } /* terminator */
+};
+#endif
+/* RATE feeds the Eb/N0 to Es/N0 conversion; MAXBYTES caps the frame size */
+#define RATE (1.0/2.0)
+#define MAXBYTES 10000
+/* Defaults, adjustable with -g and -v */
+double Gain = 32.0;
+int Verbose = 0;
+
+/* Test driver for the viterbi29 decoder.
+ * With -e <Eb/N0 in dB>: encodes random frames, passes them through addnoise(), decodes and prints BER/FER.
+ * Without -e: times repeated decodes of a constant symbol buffer.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the endless << 1 is well defined */
+  int trials = 10000,errcnt,framebits=2048;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: at the 8*MAXBYTES limit the 8 tail bits are stored in byte MAXBYTES */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*2*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel meaning "-e not given": run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi29(framebits)) == NULL){
+    printf("create_viterbi29 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data followed by 8 zero tail bits */
+      for(i=0;i<framebits+8;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff; /* last write to each byte (i%8 == 7) leaves the full data byte */
+	symbols[2*i+0] = addnoise(parity(sr & V29POLYA),gain,Gain,127.5,255);
+	symbols[2*i+1] = addnoise(parity(sr & V29POLYB),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer; first initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits+8);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials); /* tr == trials here; tr+1 would overcount frames */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi29(vp,0);
+
+      /* Decode block */
+      update_viterbi29_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi29(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}
+
+
diff --git a/libfec/vtest39.c b/libfec/vtest39.c
new file mode 100644
index 0000000..76723b2
--- /dev/null
+++ b/libfec/vtest39.c
@@ -0,0 +1,186 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the single-letter flags handled in main() */
+  { "frame-length",  required_argument, NULL, 'l' },
+  { "frame-count",   required_argument, NULL, 'n' },
+  { "ebn0",          required_argument, NULL, 'e' },
+  { "gain",          required_argument, NULL, 'g' },
+  { "verbose",       no_argument,       NULL, 'v' },
+  { "force-altivec", no_argument,       NULL, 'a' },
+  { "force-port",    no_argument,       NULL, 'p' },
+  { "force-mmx",     no_argument,       NULL, 'm' },
+  { "force-sse",     no_argument,       NULL, 's' },
+  { "force-sse2",    no_argument,       NULL, 't' },
+  { NULL, 0, NULL, 0 } /* terminator */
+};
+#endif
+/* RATE feeds the Eb/N0 to Es/N0 conversion; MAXBYTES caps the frame size */
+#define RATE (1.0/3.0)
+#define MAXBYTES 10000
+/* Defaults, adjustable with -g and -v */
+double Gain = 32.0;
+int Verbose = 0;
+
+/* Test driver for the viterbi39 decoder.
+ * With -e <Eb/N0 in dB>: encodes random frames, passes them through addnoise(), decodes and prints BER/FER.
+ * Without -e: times repeated decodes of a constant symbol buffer.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the endless << 1 is well defined */
+  int trials = 10000,errcnt,framebits=2048;
+  long long tot_errs=0;
+  unsigned char bits[MAXBYTES+1]; /* +1: at the 8*MAXBYTES limit the 8 tail bits are stored in byte MAXBYTES */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*3*(MAXBYTES+8)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel meaning "-e not given": run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi39(framebits)) == NULL){
+    printf("create_viterbi39 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data followed by 8 zero tail bits */
+      for(i=0;i<framebits+8;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff; /* last write to each byte (i%8 == 7) leaves the full data byte */
+	symbols[3*i+0] = addnoise(parity(sr & V39POLYA),gain,Gain,127.5,255);
+	symbols[3*i+1] = addnoise(parity(sr & V39POLYB),gain,Gain,127.5,255);
+	symbols[3*i+2] = addnoise(parity(sr & V39POLYC),gain,Gain,127.5,255);
+      }
+      /* Decode it and make sure we get the right answer; first initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits+8);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,tr+1,(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials); /* tr == trials here; tr+1 would overcount frames */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi39(vp,0);
+
+      /* Decode block */
+      update_viterbi39_blk(vp,symbols,framebits);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi39(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}
+
+
diff --git a/libfec/vtest615.c b/libfec/vtest615.c
new file mode 100644
index 0000000..4bd8c4f
--- /dev/null
+++ b/libfec/vtest615.c
@@ -0,0 +1,191 @@
+/* Test viterbi decoder speeds */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <math.h>
+#include <memory.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+#include "fec.h"
+
+#if HAVE_GETOPT_LONG
+struct option Options[] = { /* long spellings of the single-letter flags handled in main() */
+  { "frame-length",  required_argument, NULL, 'l' },
+  { "frame-count",   required_argument, NULL, 'n' },
+  { "ebn0",          required_argument, NULL, 'e' },
+  { "gain",          required_argument, NULL, 'g' },
+  { "verbose",       no_argument,       NULL, 'v' },
+  { "force-altivec", no_argument,       NULL, 'a' },
+  { "force-port",    no_argument,       NULL, 'p' },
+  { "force-mmx",     no_argument,       NULL, 'm' },
+  { "force-sse",     no_argument,       NULL, 's' },
+  { "force-sse2",    no_argument,       NULL, 't' },
+  { NULL, 0, NULL, 0 } /* terminator */
+};
+#endif
+/* RATE feeds the Eb/N0 to Es/N0 conversion; MAXBYTES caps the frame size; OFFSET and CLIP are passed to addnoise() */
+#define RATE (1.0/6.0)
+#define MAXBYTES 10000
+#define OFFSET (127.5)
+#define CLIP 255
+/* Defaults, adjustable with -g and -v */
+double Gain = 24.0;
+int Verbose = 0;
+
+/* Test driver for the viterbi615 decoder.
+ * With -e <Eb/N0 in dB>: encodes random frames, passes them through addnoise(), decodes and prints BER/FER.
+ * Without -e: times repeated decodes of a constant symbol buffer.
+ */
+int main(int argc,char *argv[]){
+  int i,d,tr;
+  unsigned int sr=0; /* encoder shift register; unsigned so the endless << 1 is well defined */
+  int trials = 10,errcnt,framebits=2048;
+  long long tot_errs=0; /* long long, as in the other vtest drivers, so long runs cannot overflow */
+  unsigned char bits[MAXBYTES+2]; /* +2: at the 8*MAXBYTES limit the 14 tail bits spill into bytes MAXBYTES and MAXBYTES+1 */
+  unsigned char data[MAXBYTES];
+  unsigned char xordata[MAXBYTES];
+  unsigned char symbols[8*6*(MAXBYTES+14)];
+  void *vp;
+  extern char *optarg;
+  struct rusage start,finish;
+  double extime;
+  double gain,esn0,ebn0;
+  time_t t;
+  int badframes=0;
+
+  time(&t);
+  srandom(t);
+  ebn0 = -100; /* sentinel meaning "-e not given": run the time trials */
+#if HAVE_GETOPT_LONG
+  while((d = getopt_long(argc,argv,"l:n:e:g:vapmst",Options,NULL)) != -1){
+#else
+  while((d = getopt(argc,argv,"l:n:e:g:vapmst")) != -1){
+#endif
+    switch(d){
+    case 'a':
+      Cpu_mode = ALTIVEC;
+      break;
+    case 'p':
+      Cpu_mode = PORT;
+      break;
+    case 'm':
+      Cpu_mode = MMX;
+      break;
+    case 's':
+      Cpu_mode = SSE;
+      break;
+    case 't':
+      Cpu_mode = SSE2;
+      break;
+    case 'l':
+      framebits = atoi(optarg);
+      break;
+    case 'n':
+      trials = atoi(optarg);
+      break;
+    case 'e':
+      ebn0 = atof(optarg);
+      break;
+    case 'g':
+      Gain = atof(optarg);
+      break;
+    case 'v':
+      Verbose++;
+      break;
+    }
+  }
+  if(framebits > 8*MAXBYTES){
+    fprintf(stderr,"Frame limited to %d bits\n",MAXBYTES*8);
+    framebits = MAXBYTES*8;
+  }
+  if((vp = create_viterbi615(framebits)) == NULL){
+    printf("create_viterbi615 failed\n");
+    exit(1);
+  }
+  if(ebn0 != -100){
+    esn0 = ebn0 + 10*log10((double)RATE); /* Es/No in dB */
+    /* Compute noise voltage. The 0.5 factor accounts for BPSK seeing
+     * only half the noise power, and the sqrt() converts power to
+     * voltage.
+     */
+    gain = 1./sqrt(0.5/pow(10.,esn0/10.));
+    printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    for(tr=0;tr<trials;tr++){
+      /* Encode a frame of random data followed by 14 zero tail bits */
+      for(i=0;i<framebits+14;i++){
+	int bit = (i < framebits) ? (random() & 1) : 0;
+	sr = (sr << 1) | bit;
+	bits[i/8] = sr & 0xff; /* last write to each byte (i%8 == 7) leaves the full data byte */
+	symbols[6*i+0] = addnoise(parity(sr & V615POLYA),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+1] = addnoise(parity(sr & V615POLYB),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+2] = addnoise(parity(sr & V615POLYC),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+3] = addnoise(parity(sr & V615POLYD),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+4] = addnoise(parity(sr & V615POLYE),gain,Gain,OFFSET,CLIP);
+	symbols[6*i+5] = addnoise(parity(sr & V615POLYF),gain,Gain,OFFSET,CLIP);
+      }
+      /* Decode it and make sure we get the right answer; first initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+      errcnt = 0;
+      for(i=0;i<framebits/8;i++){
+	int e = Bitcnt[xordata[i] = data[i] ^ bits[i]];
+	errcnt += e;
+	tot_errs += e;
+      }
+      if(errcnt != 0)
+	badframes++;
+      if(Verbose > 1 && errcnt != 0){
+	printf("frame %d, %d errors: ",tr,errcnt);
+	for(i=0;i<framebits/8;i++){
+	  printf("%02x",xordata[i]);
+	}
+	printf("\n");
+      }
+      if(Verbose)
+	printf("BER %lld/%lld (%10.3g) FER %d/%d (%10.3g)\r",
+	       tot_errs,(long long)framebits*(tr+1),tot_errs/((double)framebits*(tr+1)),
+	       badframes,(tr+1),(double)badframes/(tr+1));
+      fflush(stdout);
+    }
+
+    if(Verbose > 1)
+      printf("nframes = %d framesize = %d ebn0 = %.2f dB gain = %g\n",trials,framebits,ebn0,Gain);
+    else if(Verbose == 0)
+      printf("BER %lld/%lld (%.3g) FER %d/%d (%.3g)\n",
+	     tot_errs,(long long)framebits*trials,tot_errs/((double)framebits*trials),
+	     badframes,trials,(double)badframes/trials); /* tr == trials here; tr+1 would overcount frames */
+    else
+      printf("\n");
+  } else {
+    /* Do time trials */
+    memset(symbols,127,sizeof(symbols));
+    printf("Starting time trials\n");
+    getrusage(RUSAGE_SELF,&start);
+    for(tr=0;tr < trials;tr++){
+      /* Initialize Viterbi decoder */
+      init_viterbi615(vp,0);
+
+      /* Decode block */
+      update_viterbi615_blk(vp,symbols,framebits+14);
+
+      /* Do Viterbi chainback */
+      chainback_viterbi615(vp,data,framebits,0);
+    }
+    getrusage(RUSAGE_SELF,&finish);
+    extime = finish.ru_utime.tv_sec - start.ru_utime.tv_sec + 1e-6*(finish.ru_utime.tv_usec - start.ru_utime.tv_usec);
+    printf("Execution time for %d %d-bit frames: %.2f sec\n",trials,
+	   framebits,extime);
+    printf("decoder speed: %g bits/s\n",(double)trials*framebits/extime); /* multiply in double: the int product can overflow */
+  }
+  exit(0);
+}