[149853] trunk/dports/science/volk
michaelld at macports.org
michaelld at macports.org
Mon Jul 4 08:24:16 PDT 2016
Revision: 149853
https://trac.macports.org/changeset/149853
Author: michaelld at macports.org
Date: 2016-07-04 08:24:16 -0700 (Mon, 04 Jul 2016)
Log Message:
-----------
volk: update release to 1.3 and devel to 18428fb9 (20160702), removing integrated patch.
Modified Paths:
--------------
trunk/dports/science/volk/Portfile
Removed Paths:
-------------
trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff
Modified: trunk/dports/science/volk/Portfile
===================================================================
--- trunk/dports/science/volk/Portfile 2016-07-04 10:01:38 UTC (rev 149852)
+++ trunk/dports/science/volk/Portfile 2016-07-04 15:24:16 UTC (rev 149853)
@@ -19,11 +19,10 @@
if {${name} eq ${subport}} {
- github.setup gnuradio volk 1.2.2 v
- revision 1
+ github.setup gnuradio volk 1.3 v
checksums \
- rmd160 ba5e73686ea80a479bfa0dd545b71b3e3b85a9d9 \
- sha256 f7f186205e05dc62448cd138a38cc4434142e8dd7c3eb914b79689b3055f1152
+ rmd160 5f1baa8dfc8ac948e4223e4f14f0ff328afffc9f \
+ sha256 fc112c38e898b1ad1bba79debf0769fecc555143bd60e33ea571c1e9908c320e
# bump the epoch because I moved the version from 20150707 to 1.0.1
epoch 1
@@ -31,21 +30,15 @@
provides the release version, which is typically updated every month or so.
conflicts volk-devel
- # temporary patchfile to correct API for volk_32f_index_max_16u to
- # be the same as that provided by volk-devel, and thus correct
- # with respect to usage by other ports.
-
- patchfiles-append patch-update_1.2.2_to_current.diff
-
}
subport volk-devel {
- github.setup gnuradio volk 96112746c8a8be15c0c4c968e3cfa0e8c5d3d713
- version 20160623
+ github.setup gnuradio volk 18428fb9f718f5f7fa34707dd47ab6db07d88683
+ version 20160702
checksums \
- rmd160 f17350be3f5b6feff0e7b0e4aa08836ae0775c68 \
- sha256 efd6998229c68aaa985f695931f8d35b62f5120ea58a0556aa6075d1ab035a8f
+ rmd160 615489762fc28dae2deef5d38c24004fae7f73f9 \
+ sha256 b6b40d98e96ded5dfa4441c69a1a656fcbe3c6a8f4beb8323e23c2224605afa8
conflicts volk
long_description ${long_description} ${subport} \
Deleted: trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff
===================================================================
--- trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff 2016-07-04 10:01:38 UTC (rev 149852)
+++ trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff 2016-07-04 15:24:16 UTC (rev 149853)
@@ -1,1295 +0,0 @@
---- CMakeLists.txt.orig
-+++ CMakeLists.txt
-@@ -215,6 +215,11 @@ endif()
- ########################################################################
-
- configure_file(
-+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
-+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
-+ @ONLY)
-+
-+configure_file(
- ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
- ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
- @ONLY)
-@@ -230,7 +235,7 @@ endif(NOT CMAKE_MODULES_DIR)
-
- install(
- FILES
-- ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake
-+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
- ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
- DESTINATION ${CMAKE_MODULES_DIR}/volk
- COMPONENT "volk_devel"
---- apps/volk-config-info.cc.orig
-+++ apps/volk-config-info.cc
-@@ -1,6 +1,6 @@
- /* -*- c++ -*- */
- /*
-- * Copyright 2013 Free Software Foundation, Inc.
-+ * Copyright 2013, 2016 Free Software Foundation, Inc.
- *
- * This file is part of GNU Radio
- *
-@@ -45,6 +45,8 @@ main(int argc, char **argv)
- ("all-machines", "print VOLK machines built into library")
- ("avail-machines", "print VOLK machines the current platform can use")
- ("machine", "print the VOLK machine that will be used")
-+ ("alignment", "print the alignment that will be used")
-+ ("malloc", "print malloc implementation that will be used")
- ("version,v", "print VOLK version")
- ;
-
-@@ -88,5 +90,22 @@ main(int argc, char **argv)
- std::cout << volk_get_machine() << std::endl;
- }
-
-+ if(vm.count("alignment")) {
-+ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
-+ }
-+
-+ // You don't want to change the volk_malloc code, so just copy the if/else
-+ // structure from there and give an explanation for the implementations
-+ if(vm.count("malloc")) {
-+ std::cout << "Used malloc implementation: ";
-+#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
-+ std::cout << "posix_memalign" << std::endl;
-+#elif _MSC_VER >= 1400
-+ std::cout << "aligned_malloc" << std::endl;
-+#else
-+ std::cout << "No standard handler available, using own implementation." << std::endl;
-+#endif
-+ }
-+
- return 0;
- }
---- cmake/Modules/VolkConfig.cmake
-+++ /dev/null
-@@ -1,26 +0,0 @@
--INCLUDE(FindPkgConfig)
--PKG_CHECK_MODULES(PC_VOLK volk)
--
--FIND_PATH(
-- VOLK_INCLUDE_DIRS
-- NAMES volk/volk.h
-- HINTS $ENV{VOLK_DIR}/include
-- ${PC_VOLK_INCLUDEDIR}
-- PATHS /usr/local/include
-- /usr/include
--)
--
--FIND_LIBRARY(
-- VOLK_LIBRARIES
-- NAMES volk
-- HINTS $ENV{VOLK_DIR}/lib
-- ${PC_VOLK_LIBDIR}
-- PATHS /usr/local/lib
-- /usr/local/lib64
-- /usr/lib
-- /usr/lib64
--)
--
--INCLUDE(FindPackageHandleStandardArgs)
--FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
--MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
---- /dev/null
-+++ cmake/Modules/VolkConfig.cmake.in
-@@ -0,0 +1,28 @@
-+INCLUDE(FindPkgConfig)
-+PKG_CHECK_MODULES(PC_VOLK volk)
-+
-+FIND_PATH(
-+ VOLK_INCLUDE_DIRS
-+ NAMES volk/volk.h
-+ HINTS $ENV{VOLK_DIR}/include
-+ ${PC_VOLK_INCLUDEDIR}
-+ PATHS /usr/local/include
-+ /usr/include
-+ "@CMAKE_INSTALL_PREFIX@/include"
-+)
-+
-+FIND_LIBRARY(
-+ VOLK_LIBRARIES
-+ NAMES volk
-+ HINTS $ENV{VOLK_DIR}/lib
-+ ${PC_VOLK_LIBDIR}
-+ PATHS /usr/local/lib
-+ /usr/local/lib64
-+ /usr/lib
-+ /usr/lib64
-+ "@CMAKE_INSTALL_PREFIX@/lib"
-+)
-+
-+INCLUDE(FindPackageHandleStandardArgs)
-+FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
-+MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
---- docs/kernels.dox.orig
-+++ docs/kernels.dox
-@@ -47,6 +47,7 @@
- \li \subpage volk_32fc_deinterleave_real_32f
- \li \subpage volk_32fc_deinterleave_real_64f
- \li \subpage volk_32fc_index_max_16u
-+\li \subpage volk_32fc_index_max_32u
- \li \subpage volk_32fc_magnitude_32f
- \li \subpage volk_32fc_magnitude_squared_32f
- \li \subpage volk_32f_cos_32f
-@@ -61,6 +62,7 @@
- \li \subpage volk_32fc_x2_square_dist_32f
- \li \subpage volk_32f_expfast_32f
- \li \subpage volk_32f_index_max_16u
-+\li \subpage volk_32f_index_max_32u
- \li \subpage volk_32f_invsqrt_32f
- \li \subpage volk_32f_log2_32f
- \li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
---- kernels/volk/volk_32f_index_max_16u.h.orig
-+++ kernels/volk/volk_32f_index_max_16u.h
-@@ -25,11 +25,18 @@
- *
- * \b Overview
- *
-- * Returns Argmax_i x[i]. Finds and returns the index which contains the maximum value in the given vector.
-+ * Returns Argmax_i x[i]. Finds and returns the index which contains
-+ * the maximum value in the given vector.
-+ *
-+ * Note that num_points is a uint32_t, but the return value is
-+ * uint16_t. Providing a vector larger than the max of a uint16_t
-+ * (65536) would miss anything outside of this boundary. The kernel
-+ * will check the length of num_points and cap it to this max value,
-+ * anyways.
- *
- * <b>Dispatcher Prototype</b>
- * \code
-- * void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points)
-+ * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points)
- * \endcode
- *
- * \b Inputs
-@@ -42,11 +49,11 @@
- * \b Example
- * \code
- * int N = 10;
-- * unsigned int alignment = volk_get_alignment();
-+ * uint32_t alignment = volk_get_alignment();
- * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
-- * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
-+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
- *
-- * for(unsigned int ii = 0; ii < N; ++ii){
-+ * for(uint32_t ii = 0; ii < N; ++ii){
- * float x = (float)ii;
- * // a parabola with a maximum at x=4
- * in[ii] = -(x-4) * (x-4) + 5;
-@@ -67,64 +74,66 @@
- #include <volk/volk_common.h>
- #include <volk/volk_common.h>
- #include <inttypes.h>
-+#include <limits.h>
- #include <stdio.h>
-
- #ifdef LV_HAVE_SSE4_1
--#include<smmintrin.h>
-+#include <smmintrin.h>
-
- static inline void
--volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points)
-+volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
-+ uint32_t num_points)
- {
-- if(num_points > 0){
-- unsigned int number = 0;
-- const unsigned int quarterPoints = num_points / 4;
-+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-- float* inputPtr = (float*)src0;
-+ uint32_t number = 0;
-+ const uint32_t quarterPoints = num_points / 4;
-
-- __m128 indexIncrementValues = _mm_set1_ps(4);
-- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-+ float* inputPtr = (float*)src0;
-
-- float max = src0[0];
-- float index = 0;
-- __m128 maxValues = _mm_set1_ps(max);
-- __m128 maxValuesIndex = _mm_setzero_ps();
-- __m128 compareResults;
-- __m128 currentValues;
-+ __m128 indexIncrementValues = _mm_set1_ps(4);
-+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-+ float max = src0[0];
-+ float index = 0;
-+ __m128 maxValues = _mm_set1_ps(max);
-+ __m128 maxValuesIndex = _mm_setzero_ps();
-+ __m128 compareResults;
-+ __m128 currentValues;
-
-- for(;number < quarterPoints; number++){
-+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
-- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-+ for(;number < quarterPoints; number++){
-
-- compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-
-- maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
-- maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
-- }
-+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-
-- // Calculate the largest value from the remaining 4 points
-- _mm_store_ps(maxValuesBuffer, maxValues);
-- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
-+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
-+ }
-
-- for(number = 0; number < 4; number++){
-- if(maxValuesBuffer[number] > max){
-- index = maxIndexesBuffer[number];
-- max = maxValuesBuffer[number];
-- }
-+ // Calculate the largest value from the remaining 4 points
-+ _mm_store_ps(maxValuesBuffer, maxValues);
-+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-+
-+ for(number = 0; number < 4; number++){
-+ if(maxValuesBuffer[number] > max){
-+ index = maxIndexesBuffer[number];
-+ max = maxValuesBuffer[number];
- }
-+ }
-
-- number = quarterPoints * 4;
-- for(;number < num_points; number++){
-- if(src0[number] > max){
-- index = number;
-- max = src0[number];
-- }
-+ number = quarterPoints * 4;
-+ for(;number < num_points; number++){
-+ if(src0[number] > max){
-+ index = number;
-+ max = src0[number];
- }
-- target[0] = (unsigned int)index;
- }
-+ target[0] = (uint16_t)index;
- }
-
- #endif /*LV_HAVE_SSE4_1*/
-@@ -132,62 +141,63 @@ volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigne
-
- #ifdef LV_HAVE_SSE
-
--#include<xmmintrin.h>
-+#include <xmmintrin.h>
-
- static inline void
--volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points)
-+volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
-+ uint32_t num_points)
- {
-- if(num_points > 0){
-- unsigned int number = 0;
-- const unsigned int quarterPoints = num_points / 4;
-+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-
-- float* inputPtr = (float*)src0;
-+ uint32_t number = 0;
-+ const uint32_t quarterPoints = num_points / 4;
-
-- __m128 indexIncrementValues = _mm_set1_ps(4);
-- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-+ float* inputPtr = (float*)src0;
-
-- float max = src0[0];
-- float index = 0;
-- __m128 maxValues = _mm_set1_ps(max);
-- __m128 maxValuesIndex = _mm_setzero_ps();
-- __m128 compareResults;
-- __m128 currentValues;
-+ __m128 indexIncrementValues = _mm_set1_ps(4);
-+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-
-- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-+ float max = src0[0];
-+ float index = 0;
-+ __m128 maxValues = _mm_set1_ps(max);
-+ __m128 maxValuesIndex = _mm_setzero_ps();
-+ __m128 compareResults;
-+ __m128 currentValues;
-
-- for(;number < quarterPoints; number++){
-+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-
-- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-+ for(;number < quarterPoints; number++){
-
-- compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-
-- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
-+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-
-- maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
-- }
-+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
-+
-+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
-+ }
-
-- // Calculate the largest value from the remaining 4 points
-- _mm_store_ps(maxValuesBuffer, maxValues);
-- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-+ // Calculate the largest value from the remaining 4 points
-+ _mm_store_ps(maxValuesBuffer, maxValues);
-+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-
-- for(number = 0; number < 4; number++){
-- if(maxValuesBuffer[number] > max){
-- index = maxIndexesBuffer[number];
-- max = maxValuesBuffer[number];
-- }
-+ for(number = 0; number < 4; number++){
-+ if(maxValuesBuffer[number] > max){
-+ index = maxIndexesBuffer[number];
-+ max = maxValuesBuffer[number];
- }
-+ }
-
-- number = quarterPoints * 4;
-- for(;number < num_points; number++){
-- if(src0[number] > max){
-- index = number;
-- max = src0[number];
-- }
-+ number = quarterPoints * 4;
-+ for(;number < num_points; number++){
-+ if(src0[number] > max){
-+ index = number;
-+ max = src0[number];
- }
-- target[0] = (unsigned int)index;
- }
-+ target[0] = (uint16_t)index;
- }
-
- #endif /*LV_HAVE_SSE*/
-@@ -196,22 +206,23 @@ volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned i
- #ifdef LV_HAVE_GENERIC
-
- static inline void
--volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points)
-+volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
-+ uint32_t num_points)
- {
-- if(num_points > 0){
-- float max = src0[0];
-- unsigned int index = 0;
-+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-+
-+ float max = src0[0];
-+ uint16_t index = 0;
-
-- unsigned int i = 1;
-+ uint32_t i = 1;
-
-- for(; i < num_points; ++i) {
-- if(src0[i] > max){
-- index = i;
-- max = src0[i];
-- }
-+ for(; i < num_points; ++i) {
-+ if(src0[i] > max) {
-+ index = i;
-+ max = src0[i];
- }
-- target[0] = index;
- }
-+ target[0] = index;
- }
-
- #endif /*LV_HAVE_GENERIC*/
---- /dev/null
-+++ kernels/volk/volk_32f_index_max_32u.h
-@@ -0,0 +1,220 @@
-+/* -*- c++ -*- */
-+/*
-+ * Copyright 2016 Free Software Foundation, Inc.
-+ *
-+ * This file is part of GNU Radio
-+ *
-+ * GNU Radio is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 3, or (at your option)
-+ * any later version.
-+ *
-+ * GNU Radio is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with GNU Radio; see the file COPYING. If not, write to
-+ * the Free Software Foundation, Inc., 51 Franklin Street,
-+ * Boston, MA 02110-1301, USA.
-+ */
-+
-+/*!
-+ * \page volk_32f_index_max_32u
-+ *
-+ * \b Overview
-+ *
-+ * Returns Argmax_i x[i]. Finds and returns the index which contains the maximum value in the given vector.
-+ *
-+ * <b>Dispatcher Prototype</b>
-+ * \code
-+ * void volk_32f_index_max_32u(uint32_t* target, const float* src0, uint32_t num_points)
-+ * \endcode
-+ *
-+ * \b Inputs
-+ * \li src0: The input vector of floats.
-+ * \li num_points: The number of data points.
-+ *
-+ * \b Outputs
-+ * \li target: The index of the maximum value in the input buffer.
-+ *
-+ * \b Example
-+ * \code
-+ * int N = 10;
-+ * uint32_t alignment = volk_get_alignment();
-+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
-+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
-+ *
-+ * for(uint32_t ii = 0; ii < N; ++ii){
-+ * float x = (float)ii;
-+ * // a parabola with a maximum at x=4
-+ * in[ii] = -(x-4) * (x-4) + 5;
-+ * }
-+ *
-+ * volk_32f_index_max_32u(out, in, N);
-+ *
-+ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
-+ *
-+ * volk_free(in);
-+ * volk_free(out);
-+ * \endcode
-+ */
-+
-+#ifndef INCLUDED_volk_32f_index_max_32u_a_H
-+#define INCLUDED_volk_32f_index_max_32u_a_H
-+
-+#include <volk/volk_common.h>
-+#include <volk/volk_common.h>
-+#include <inttypes.h>
-+#include <stdio.h>
-+
-+#ifdef LV_HAVE_SSE4_1
-+#include<smmintrin.h>
-+
-+static inline void
-+volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
-+{
-+ if(num_points > 0){
-+ uint32_t number = 0;
-+ const uint32_t quarterPoints = num_points / 4;
-+
-+ float* inputPtr = (float*)src0;
-+
-+ __m128 indexIncrementValues = _mm_set1_ps(4);
-+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-+
-+ float max = src0[0];
-+ float index = 0;
-+ __m128 maxValues = _mm_set1_ps(max);
-+ __m128 maxValuesIndex = _mm_setzero_ps();
-+ __m128 compareResults;
-+ __m128 currentValues;
-+
-+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-+
-+ for(;number < quarterPoints; number++){
-+
-+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-+
-+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-+
-+ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
-+ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
-+ }
-+
-+ // Calculate the largest value from the remaining 4 points
-+ _mm_store_ps(maxValuesBuffer, maxValues);
-+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-+
-+ for(number = 0; number < 4; number++){
-+ if(maxValuesBuffer[number] > max){
-+ index = maxIndexesBuffer[number];
-+ max = maxValuesBuffer[number];
-+ }
-+ }
-+
-+ number = quarterPoints * 4;
-+ for(;number < num_points; number++){
-+ if(src0[number] > max){
-+ index = number;
-+ max = src0[number];
-+ }
-+ }
-+ target[0] = (uint32_t)index;
-+ }
-+}
-+
-+#endif /*LV_HAVE_SSE4_1*/
-+
-+
-+#ifdef LV_HAVE_SSE
-+
-+#include<xmmintrin.h>
-+
-+static inline void
-+volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
-+{
-+ if(num_points > 0){
-+ uint32_t number = 0;
-+ const uint32_t quarterPoints = num_points / 4;
-+
-+ float* inputPtr = (float*)src0;
-+
-+ __m128 indexIncrementValues = _mm_set1_ps(4);
-+ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
-+
-+ float max = src0[0];
-+ float index = 0;
-+ __m128 maxValues = _mm_set1_ps(max);
-+ __m128 maxValuesIndex = _mm_setzero_ps();
-+ __m128 compareResults;
-+ __m128 currentValues;
-+
-+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
-+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
-+
-+ for(;number < quarterPoints; number++){
-+
-+ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
-+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
-+
-+ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
-+
-+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
-+
-+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
-+ }
-+
-+ // Calculate the largest value from the remaining 4 points
-+ _mm_store_ps(maxValuesBuffer, maxValues);
-+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
-+
-+ for(number = 0; number < 4; number++){
-+ if(maxValuesBuffer[number] > max){
-+ index = maxIndexesBuffer[number];
-+ max = maxValuesBuffer[number];
-+ }
-+ }
-+
-+ number = quarterPoints * 4;
-+ for(;number < num_points; number++){
-+ if(src0[number] > max){
-+ index = number;
-+ max = src0[number];
-+ }
-+ }
-+ target[0] = (uint32_t)index;
-+ }
-+}
-+
-+#endif /*LV_HAVE_SSE*/
-+
-+
-+#ifdef LV_HAVE_GENERIC
-+
-+static inline void
-+volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
-+{
-+ if(num_points > 0){
-+ float max = src0[0];
-+ uint32_t index = 0;
-+
-+ uint32_t i = 1;
-+
-+ for(; i < num_points; ++i) {
-+ if(src0[i] > max){
-+ index = i;
-+ max = src0[i];
-+ }
-+ }
-+ target[0] = index;
-+ }
-+}
-+
-+#endif /*LV_HAVE_GENERIC*/
-+
-+
-+#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
---- kernels/volk/volk_32fc_index_max_16u.h.orig
-+++ kernels/volk/volk_32fc_index_max_16u.h
-@@ -28,9 +28,15 @@
- * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
- * maximum magnitude for complex points in the given vector.
- *
-+ * Note that num_points is a uint32_t, but the return value is
-+ * uint16_t. Providing a vector larger than the max of a uint16_t
-+ * (65536) would miss anything outside of this boundary. The kernel
-+ * will check the length of num_points and cap it to this max value,
-+ * anyways.
-+ *
- * <b>Dispatcher Prototype</b>
- * \code
-- * void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points)
-+ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
- * \endcode
- *
- * \b Inputs
-@@ -45,11 +51,11 @@
- * the unit circle.
- * \code
- * int N = 10;
-- * unsigned int alignment = volk_get_alignment();
-+ * uint32_t alignment = volk_get_alignment();
- * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
-- * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
-+ * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
- *
-- * for(unsigned int ii = 0; ii < N/2; ++ii){
-+ * for(uint32_t ii = 0; ii < N/2; ++ii){
- * float real = 2.f * ((float)ii / (float)N) - 1.f;
- * float imag = std::sqrt(1.f - real * real);
- * in[ii] = lv_cmake(real, imag);
-@@ -71,19 +77,24 @@
- #define INCLUDED_volk_32fc_index_max_16u_a_H
-
- #include <volk/volk_common.h>
--#include<inttypes.h>
--#include<stdio.h>
--#include<volk/volk_complex.h>
-+#include <inttypes.h>
-+#include <stdio.h>
-+#include <limits.h>
-+#include <volk/volk_complex.h>
-
- #ifdef LV_HAVE_SSE3
--#include<xmmintrin.h>
--#include<pmmintrin.h>
-+#include <xmmintrin.h>
-+#include <pmmintrin.h>
-
- static inline void
--volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
-- unsigned int num_points)
-+volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
-+ uint32_t num_points)
- {
-- const unsigned int num_bytes = num_points*8;
-+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-+ // Branchless version, if we think it'll make a difference
-+ //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
-+
-+ const uint32_t num_bytes = num_points*8;
-
- union bit128 holderf;
- union bit128 holderi;
-@@ -206,11 +217,11 @@ volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
- /*
- float placeholder = 0.0;
- uint32_t temp0, temp1;
-- unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-- unsigned int l0 = g0 ^ 1;
-+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-+ uint32_t l0 = g0 ^ 1;
-
-- unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-- unsigned int l1 = g1 ^ 1;
-+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-+ uint32_t l1 = g1 ^ 1;
-
- temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
- temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-@@ -227,16 +238,18 @@ volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
-
- #ifdef LV_HAVE_GENERIC
- static inline void
-- volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0,
-- unsigned int num_points)
-+ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
-+ uint32_t num_points)
- {
-- const unsigned int num_bytes = num_points*8;
-+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-+
-+ const uint32_t num_bytes = num_points*8;
-
- float sq_dist = 0.0;
- float max = 0.0;
-- unsigned int index = 0;
-+ uint16_t index = 0;
-
-- unsigned int i = 0;
-+ uint32_t i = 0;
-
- for(; i < num_bytes >> 3; ++i) {
- sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
---- /dev/null
-+++ kernels/volk/volk_32fc_index_max_32u.h
-@@ -0,0 +1,253 @@
-+/* -*- c++ -*- */
-+/*
-+ * Copyright 2016 Free Software Foundation, Inc.
-+ *
-+ * This file is part of GNU Radio
-+ *
-+ * GNU Radio is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 3, or (at your option)
-+ * any later version.
-+ *
-+ * GNU Radio is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with GNU Radio; see the file COPYING. If not, write to
-+ * the Free Software Foundation, Inc., 51 Franklin Street,
-+ * Boston, MA 02110-1301, USA.
-+ */
-+
-+/*!
-+ * \page volk_32fc_index_max_32u
-+ *
-+ * \b Overview
-+ *
-+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
-+ * maximum magnitude for complex points in the given vector.
-+ *
-+ * <b>Dispatcher Prototype</b>
-+ * \code
-+ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
-+ * \endcode
-+ *
-+ * \b Inputs
-+ * \li src0: The complex input vector.
-+ * \li num_points: The number of samples.
-+ *
-+ * \b Outputs
-+ * \li target: The index of the point with maximum magnitude.
-+ *
-+ * \b Example
-+ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
-+ * the unit circle.
-+ * \code
-+ * int N = 10;
-+ * uint32_t alignment = volk_get_alignment();
-+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
-+ * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
-+ *
-+ * for(uint32_t ii = 0; ii < N/2; ++ii){
-+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
-+ * float imag = std::sqrt(1.f - real * real);
-+ * in[ii] = lv_cmake(real, imag);
-+ * in[ii] = in[ii] * in[ii] + in[ii];
-+ * in[N-ii] = lv_cmake(real, imag);
-+ * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
-+ * }
-+ *
-+ * volk_32fc_index_max_32u(max, in, N);
-+ *
-+ * printf("index of max value = %u\n", *max);
-+ *
-+ * volk_free(in);
-+ * volk_free(max);
-+ * \endcode
-+ */
-+
-+#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
-+#define INCLUDED_volk_32fc_index_max_32u_a_H
-+
-+#include <volk/volk_common.h>
-+#include<inttypes.h>
-+#include<stdio.h>
-+#include<volk/volk_complex.h>
-+
-+#ifdef LV_HAVE_SSE3
-+#include<xmmintrin.h>
-+#include<pmmintrin.h>
-+
-+static inline void
-+volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
-+ uint32_t num_points)
-+{
-+ const uint32_t num_bytes = num_points*8;
-+
-+ union bit128 holderf;
-+ union bit128 holderi;
-+ float sq_dist = 0.0;
-+
-+ union bit128 xmm5, xmm4;
-+ __m128 xmm1, xmm2, xmm3;
-+ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
-+
-+ xmm5.int_vec = xmmfive = _mm_setzero_si128();
-+ xmm4.int_vec = xmmfour = _mm_setzero_si128();
-+ holderf.int_vec = holder0 = _mm_setzero_si128();
-+ holderi.int_vec = holder1 = _mm_setzero_si128();
-+
-+ int bound = num_bytes >> 5;
-+ int leftovers0 = (num_bytes >> 4) & 1;
-+ int leftovers1 = (num_bytes >> 3) & 1;
-+ int i = 0;
-+
-+ xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
-+ xmm9 = xmm8 = _mm_setzero_si128();
-+ xmm10 = _mm_set_epi32(4, 4, 4, 4);
-+ xmm3 = _mm_setzero_ps();
-+
-+ //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
-+
-+ for(; i < bound; ++i) {
-+ xmm1 = _mm_load_ps((float*)src0);
-+ xmm2 = _mm_load_ps((float*)&src0[2]);
-+
-+ src0 += 4;
-+
-+ xmm1 = _mm_mul_ps(xmm1, xmm1);
-+ xmm2 = _mm_mul_ps(xmm2, xmm2);
-+
-+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
-+
-+ xmm3 = _mm_max_ps(xmm1, xmm3);
-+
-+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-+
-+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-+
-+ xmm9 = _mm_add_epi32(xmm11, xmm12);
-+
-+ xmm8 = _mm_add_epi32(xmm8, xmm10);
-+
-+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
-+ }
-+
-+
-+ for(i = 0; i < leftovers0; ++i) {
-+ xmm2 = _mm_load_ps((float*)src0);
-+
-+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
-+ xmm8 = bit128_p(&xmm1)->int_vec;
-+
-+ xmm2 = _mm_mul_ps(xmm2, xmm2);
-+
-+ src0 += 2;
-+
-+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
-+
-+ xmm3 = _mm_max_ps(xmm1, xmm3);
-+
-+ xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
-+
-+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-+
-+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
-+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
-+
-+ xmm9 = _mm_add_epi32(xmm11, xmm12);
-+
-+ xmm8 = _mm_add_epi32(xmm8, xmm10);
-+ //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-+ }
-+
-+ for(i = 0; i < leftovers1; ++i) {
-+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-+
-+ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
-+
-+ xmm2 = _mm_load1_ps(&sq_dist);
-+
-+ xmm1 = xmm3;
-+
-+ xmm3 = _mm_max_ss(xmm3, xmm2);
-+
-+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
-+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
-+
-+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
-+
-+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
-+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
-+
-+ xmm9 = _mm_add_epi32(xmm11, xmm12);
-+ }
-+
-+ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
-+ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
-+
-+ _mm_store_ps((float*)&(holderf.f), xmm3);
-+ _mm_store_si128(&(holderi.int_vec), xmm9);
-+
-+ target[0] = holderi.i[0];
-+ sq_dist = holderf.f[0];
-+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
-+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
-+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
-+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
-+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
-+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
-+
-+ /*
-+ float placeholder = 0.0;
-+ uint32_t temp0, temp1;
-+ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
-+ uint32_t l0 = g0 ^ 1;
-+
-+ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
-+ uint32_t l1 = g1 ^ 1;
-+
-+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
-+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
-+ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
-+ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
-+
-+ g0 = (sq_dist > placeholder);
-+ l0 = g0 ^ 1;
-+ target[0] = g0 * temp0 + l0 * temp1;
-+ */
-+}
-+
-+#endif /*LV_HAVE_SSE3*/
-+
-+#ifdef LV_HAVE_GENERIC
-+static inline void
-+ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
-+ uint32_t num_points)
-+{
-+ const uint32_t num_bytes = num_points*8;
-+
-+ float sq_dist = 0.0;
-+ float max = 0.0;
-+ uint32_t index = 0;
-+
-+ uint32_t i = 0;
-+
-+ for(; i < num_bytes >> 3; ++i) {
-+ sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
-+
-+ index = sq_dist > max ? i : index;
-+ max = sq_dist > max ? sq_dist : max;
-+ }
-+ target[0] = index;
-+}
-+
-+#endif /*LV_HAVE_GENERIC*/
-+
-+
-+#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
---- /dev/null
-+++ kernels/volk/volk_32fc_x2_divide_32fc.h
-@@ -0,0 +1,226 @@
-+/* -*- c++ -*- */
-+/*
-+ * Copyright 2016 Free Software Foundation, Inc.
-+ *
-+ * This file is part of GNU Radio
-+ *
-+ * GNU Radio is free software; you can redistribute it and/or modify
-+ * it under the terms of the GNU General Public License as published by
-+ * the Free Software Foundation; either version 3, or (at your option)
-+ * any later version.
-+ *
-+ * GNU Radio is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-+ * GNU General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU General Public License
-+ * along with GNU Radio; see the file COPYING. If not, write to
-+ * the Free Software Foundation, Inc., 51 Franklin Street,
-+ * Boston, MA 02110-1301, USA.
-+ */
-+
-+/*!
-+ * \page volk_32fc_x2_divide_32fc
-+ *
-+ * \b Overview
-+ *
-+ * Divide first vector of complexes element-wise by second.
-+ *
-+ * <b>Dispatcher Prototype</b>
-+ * \code
-+ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
-+ * \endcode
-+ *
-+ * \b Inputs
-+ * \li numeratorVector: The numerator complex values.
-+ * \li denumeratorVector: The denominator complex values.
-+ * \li num_points: The number of data points.
-+ *
-+ * \b Outputs
-+ * \li cVector: The output vector of complex floats.
-+ *
-+ * \b Example
-+ * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
-+ *
-+ * \code
-+ * int N = 10;
-+ * unsigned int alignment = volk_get_alignment();
-+ * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
-+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
-+ *
-+ * float delta = 2.f*M_PI / (float)N;
-+ * for(unsigned int ii = 0; ii < N; ++ii){
-+ * float real_1 = std::cos(0.3f * (float)ii);
-+ * float imag_1 = std::sin(0.3f * (float)ii);
-+ * input_vector[ii] = lv_cmake(real_1, imag_1);
-+ * }
-+ *
-+ * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
-+ *
-+ * for(unsigned int ii = 0; ii < N; ++ii){
-+ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
-+ * }
-+ * printf("\n");
-+ *
-+ * volk_free(input_vector);
-+ * volk_free(out);
-+ * \endcode
-+ */
-+
-+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
-+#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
-+
-+#include <inttypes.h>
-+#include <volk/volk_complex.h>
-+#include <float.h>
-+
-+#ifdef LV_HAVE_AVX
-+#include <immintrin.h>
-+#include <volk/volk_avx_intrinsics.h>
-+
-+static inline void
-+volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
-+{
-+ /*
-+ * we'll do the "classical"
-+ * a a b*
-+ * --- = -------
-+ * b |b|^2
-+ * */
-+ unsigned int number = 0;
-+ const unsigned int quarterPoints = num_points / 4;
-+
-+ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
-+ lv_32fc_t* c = cVector;
-+ const lv_32fc_t* a = numeratorVector;
-+ const lv_32fc_t* b = denumeratorVector;
-+
-+ for(; number < quarterPoints; number++){
-+ num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-+ denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
-+ sq = _mm256_mul_ps(denum, denum); // Square the values
-+ mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
-+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
-+ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
-+ div = _mm256_div_ps(mul_conj,mag_sq);
-+
-+ _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
-+
-+ a += 4;
-+ b += 4;
-+ c += 4;
-+ }
-+
-+ number = quarterPoints * 4;
-+
-+ for(; number < num_points; number++){
-+ *c++ = (*a++) / (*b++);
-+ }
-+
-+}
-+#endif /* LV_HAVE_AVX */
-+
-+
-+#ifdef LV_HAVE_GENERIC
-+
-+static inline void
-+volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-+ const lv_32fc_t* bVector, unsigned int num_points)
-+{
-+ lv_32fc_t* cPtr = cVector;
-+ const lv_32fc_t* aPtr = aVector;
-+ const lv_32fc_t* bPtr= bVector;
-+ unsigned int number = 0;
-+
-+ for(number = 0; number < num_points; number++){
-+ *cPtr++ = (*aPtr++) / (*bPtr++);
-+ }
-+}
-+#endif /* LV_HAVE_GENERIC */
-+
-+
-+
-+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
-+
-+
-+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
-+#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
-+
-+#include <inttypes.h>
-+#include <stdio.h>
-+#include <volk/volk_complex.h>
-+#include <float.h>
-+
-+
-+#ifdef LV_HAVE_AVX
-+#include <immintrin.h>
-+#include <volk/volk_avx_intrinsics.h>
-+
-+static inline void
-+volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
-+ const lv_32fc_t* denumeratorVector, unsigned int num_points)
-+{
-+ /*
-+ * we'll do the "classical"
-+ * a a b*
-+ * --- = -------
-+ * b |b|^2
-+ * */
-+ unsigned int number = 0;
-+ const unsigned int quarterPoints = num_points / 4;
-+
-+ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
-+ lv_32fc_t* c = cVector;
-+ const lv_32fc_t* a = numeratorVector;
-+ const lv_32fc_t* b = denumeratorVector;
-+
-+ for(; number < quarterPoints; number++){
-+ num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
-+ denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
-+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
-+ sq = _mm256_mul_ps(denum, denum); // Square the values
-+ mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
-+ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
-+ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
-+ div = _mm256_div_ps(mul_conj,mag_sq);
-+
-+ _mm256_store_ps((float*) c, div); // Store the results back into the C container
-+
-+ a += 4;
-+ b += 4;
-+ c += 4;
-+ }
-+
-+ number = quarterPoints * 4;
-+
-+ for(; number < num_points; number++){
-+ *c++ = (*a++) / (*b++);
-+ }
-+
-+
-+}
-+#endif /* LV_HAVE_AVX */
-+
-+
-+#ifdef LV_HAVE_GENERIC
-+
-+static inline void
-+volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
-+ const lv_32fc_t* bVector, unsigned int num_points)
-+{
-+ lv_32fc_t* cPtr = cVector;
-+ const lv_32fc_t* aPtr = aVector;
-+ const lv_32fc_t* bPtr= bVector;
-+ unsigned int number = 0;
-+
-+ for(number = 0; number < num_points; number++){
-+ *cPtr++ = (*aPtr++) / (*bPtr++);
-+ }
-+}
-+#endif /* LV_HAVE_GENERIC */
-+
-+
-+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
---- lib/CMakeLists.txt.orig
-+++ lib/CMakeLists.txt
-@@ -383,7 +383,7 @@ foreach(machine_name ${available_machines})
- )
- MESSAGE(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
- set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )
-- if(${machine_name}_flags)
-+ if(${machine_name}_flags AND NOT MSVC)
- set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
- endif()
-
---- lib/kernel_tests.h.orig
-+++ lib/kernel_tests.h
-@@ -50,6 +50,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
- (VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
- (VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
- (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
-+ (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
- (VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
- (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
- (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-@@ -73,11 +74,13 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
- (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
- (VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
- (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
-+ (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
- (VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1))
- (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc))
- (VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
- (VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
- (VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
-+ (VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
- (VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
- (VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
- (VOLK_INIT_TEST(volk_32f_s32f_convert_32i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
---- lib/volk_rank_archs.c.orig
-+++ lib/volk_rank_archs.c
-@@ -38,7 +38,7 @@ int volk_get_index(
- }
- //TODO return -1;
- //something terrible should happen here
-- printf("Volk warning: no arch found, returning generic impl\n");
-+ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
- return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
- }
-
---- tmpl/volk.tmpl.c.orig
-+++ tmpl/volk.tmpl.c
-@@ -53,7 +53,7 @@ struct volk_machine *get_machine(void)
- }
- }
- machine = max_machine;
-- printf("Using Volk machine: %s\n", machine->name);
-+ //printf("Using Volk machine: %s\n", machine->name);
- __alignment = machine->alignment;
- __alignment_mask = (intptr_t)(__alignment-1);
- return machine;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20160704/5917b19b/attachment-0001.html>
More information about the macports-changes
mailing list