#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
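/*
 * Polar-code frame encoder. Each protokernel walks the encoder stages from the
 * full frame down to pairs of bytes: in every stage, adjacent bytes of the
 * scratch buffer `temp` are combined so that the first half of each branch
 * receives u[2i] ^ u[2i+1] and the second half receives u[2i+1]. `frame_size`
 * must be a power of two; `frame` and `temp` each hold `frame_size` bytes.
 */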
/* integer log2 of a power-of-two value via the classic bit-twiddling hack */
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
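/*
 * One encoder stage: consumes pairs from temp_ptr and writes the XOR of each
 * pair into the first half of the current branch and the untouched second
 * element into the second half, for num_branches branches of 2 * frame_half
 * bytes each.
 */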
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage--) {
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    }
}
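/*
 * Minimal usage sketch for the generic protokernel (buffer handling is
 * illustrative only; volk_malloc/volk_free come from <volk/volk.h>):
 *
 *   unsigned int frame_size = 1024;   // must be a power of two
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // ... fill temp with the frame to encode ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
 *   // frame now holds the encoded frame; temp is clobbered.
 *   volk_free(frame);
 *   volk_free(temp);
 */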
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
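    /*
     * shuffle_separate de-interleaves a 16-byte chunk: even-indexed bytes are
     * gathered into the low 8 bytes and odd-indexed bytes into the high 8
     * bytes, so the XOR results and the pass-through elements of a stage end
     * up in separate halves.
     */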
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
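                /*
                 * Each 16-byte iteration loads two chunks from temp_ptr,
                 * applies the adjacent-byte XOR (shift by one byte, mask,
                 * XOR), separates XOR results from pass-through bytes with
                 * shuffle_separate, and stores the two halves to frame_ptr
                 * and frame_ptr + frame_half.
                 */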
            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
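    /*
     * shuffle_stage4 reorders the 16 bytes of a branch into bit-reversed
     * index order (0, 8, 4, 12, ...), so the remaining four stages can be
     * computed inside one register with byte shifts of 8, 4, 2 and 1.
     */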
    for (branch = 0; branch < num_branches; ++branch) {
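        /*
         * Final pass per 16-byte branch: load the branch, apply
         * shuffle_stage4, then XOR in the byte-shifted register masked for
         * stages 4, 3, 2 and 1 (shifts of 8, 4, 2, 1 bytes), and store the
         * result. The 256-bit variants below do the same on two branches at
         * a time.
         */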
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
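    /*
     * mask_stage1 keeps every second byte of the shifted register, so that
     * "shift right by one byte, mask, XOR" turns each adjacent pair
     * (u[2i], u[2i+1]) into (u[2i] ^ u[2i+1], u[2i+1]); the constant
     * alternates 0x0 and 0xFF bytes accordingly.
     */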
    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    const __m256i shuffle_separate = _mm256_setr_epi8(0,
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
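                /*
                 * Each iteration consumes 64 bytes of temp and produces 32
                 * bytes for each output half. When fewer than 32 bytes remain
                 * in a branch half, the 128-bit registers (r_temp2,
                 * shuffle_separate128) handle the tail; otherwise the 256-bit
                 * path below loads two chunks, applies the adjacent-byte XOR,
                 * separates results with shuffle_separate and rebuilds the
                 * two halves via 64-bit unpacks and a lane permute.
                 */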
                if ((frame_half - bit) < 32)
                r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
                _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
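    /*
     * The constants above drive the last four stages entirely in-register:
     * shuffle_stage4 puts each 16-byte branch into bit-reversed order, and
     * mask_stage4/3/2 select the bytes that receive the XOR when the register
     * is byte-shifted by 8, 4 and 2 (mask_stage1 covers the final shift by 1).
     */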
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#include <string.h>

#include <tmmintrin.h>
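/*
 * The _a_ protokernels below mirror the unaligned versions above; the only
 * difference is that loads and stores use the aligned intrinsics
 * (_mm_load_si128 / _mm256_load_si256 and the corresponding stores), so
 * `frame` and `temp` must be suitably aligned (e.g. allocated with
 * volk_malloc).
 */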
static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    __m128i r_frame0, r_temp0, shifted;

    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {

            frame_ptr += frame_half;
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    for (branch = 0; branch < num_branches; ++branch) {
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);
    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    const __m256i shuffle_separate = _mm256_setr_epi8(0,
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32)
                r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
                _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }