19 files changed, 215 insertions, 9 deletions
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
index 814d438fd..253033461 100644
--- a/volk/lib/Makefile.am
+++ b/volk/lib/Makefile.am
@@ -45,7 +45,9 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \
 
 # list of programs run by "make check" and "make distcheck"
 #TESTS = test_all
-
+# ORC kernels are built in the orc/ directory, and only when ORC is enabled.
+# The resulting library is linked into libvolk as an additional library.
+# TODO: there might be a better way to do this.
 
 lib_LTLIBRARIES = \
 	libvolk.la \
@@ -131,13 +133,22 @@ libvolk_runtime_la_SOURCES =	\
 	$(universal_runtime_CODE)
 endif
 
-
-
-libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 
+volk_orc_LDFLAGS = \
+	$(ORC_LDFLAGS) \
+	-lorc-0.4
+	
+volk_orc_LIBADD = \
+	../orc/libvolk_orc.la
+
+if LV_HAVE_ORC
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS)
+libvolk_la_LIBADD = $(volk_orc_LIBADD)
+else
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
 libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
-
 libvolk_la_LIBADD =
-
+endif
 
 
 # ----------------------------------------------------------------
@@ -233,7 +244,7 @@ libvolk_qa_la_SOURCES = \
 	qa_32f_stddev_aligned16.cc \
 	qa_32f_stddev_and_mean_aligned16.cc
 
-libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 
+libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
 
 libvolk_qa_la_LIBADD = \
 	libvolk.la \
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
index c775e8596..aadc39067 100644
--- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
@@ -27,6 +27,8 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
   int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse21[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc1[vlen] __attribute__ ((aligned (16)));
   int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
   int16_t output_ssse31[vlen] __attribute__ ((aligned (16)));
 
@@ -45,6 +47,13 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_16s_aligned16_manual(output_orc, output_orc1, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
   }
   end = clock();
@@ -71,6 +80,9 @@ void qa_16sc_deinterleave_16s_aligned16::t1() {
 
     CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_ssse3[i]);
     CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_ssse31[i]);
+    
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_orc[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_orc1[i]);
   }
 }
 
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
index b25094e89..13151be13 100644
--- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
@@ -27,6 +27,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
   float output_generic1[vlen] __attribute__ ((aligned (16)));
   float output_sse2[vlen] __attribute__ ((aligned (16)));
   float output_sse21[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
+  float output_orc1[vlen] __attribute__ ((aligned (16)));
 
   int16_t* loadInput = (int16_t*)input0;
   for(int i = 0; i < vlen*2; ++i) {   
@@ -43,6 +45,13 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_32f_aligned16_manual(output_orc, output_orc1, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse");
   }
   end = clock();
@@ -58,6 +67,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i],  output_sse21[i], fabs(output_generic1[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i],  output_orc1[i], fabs(output_generic1[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
index dd446567e..803caaa2d 100644
--- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
@@ -25,6 +25,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   
   int8_t output_generic[vlen] __attribute__ ((aligned (16)));
   int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+  int8_t output_orc[vlen] __attribute__ ((aligned (16)));
 
   int16_t* loadInput = (int16_t*)input0;
   for(int i = 0; i < vlen*2; ++i) {   
@@ -41,6 +42,13 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]);
   }
 }
 
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
index 9799ef43b..7fbdd8620 100644
--- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -24,6 +24,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -42,6 +43,14 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
   }
   end = clock();
@@ -65,6 +74,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
   }
 }
 
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
index 1ebe644c5..54cc2ba6e 100644
--- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -16,6 +16,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_known[vlen] __attribute__ ((aligned (16)));
 
   int16_t* inputLoad = (int16_t*)input0;
@@ -38,6 +39,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, scale, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   /*
   for(int i = 0; i < 100; ++i) {
@@ -49,6 +58,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_orc[i], output_known[i], fabs(output_generic[i])*1e-4);
   }
 }
 
@@ -64,6 +74,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_sse[vlen] __attribute__ ((aligned (16)));
   float output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -80,6 +91,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+/*  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+*/
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
     volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
@@ -105,6 +124,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+//    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
index ea117a820..c2295968b 100644
--- a/volk/lib/qa_16u_byteswap_aligned16.cc
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -25,11 +25,13 @@ void qa_16u_byteswap_aligned16::t1() {
   
   uint16_t output0[vlen] __attribute__ ((aligned (16)));
   uint16_t output01[vlen] __attribute__ ((aligned (16)));
+  uint16_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
   }
   memcpy(output01, output0, vlen*sizeof(uint16_t));
+  memcpy(output02, output0, vlen*sizeof(uint16_t));
 
   printf("16u_byteswap_aligned\n");
 
@@ -42,6 +44,13 @@ void qa_16u_byteswap_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output02, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2");
   }
   end = clock();
@@ -55,6 +64,7 @@ void qa_16u_byteswap_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);    
   }
 }
 
diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc
index f80d562d4..a183d4d85 100644
--- a/volk/lib/qa_32f_add_aligned16.cc
+++ b/volk/lib/qa_32f_add_aligned16.cc
@@ -79,6 +79,7 @@ void qa_32f_add_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -95,6 +96,13 @@ void qa_32f_add_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -108,6 +116,7 @@ void qa_32f_add_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc
index 3257a3751..f2a1b9e7f 100644
--- a/volk/lib/qa_32f_divide_aligned16.cc
+++ b/volk/lib/qa_32f_divide_aligned16.cc
@@ -36,6 +36,7 @@ void qa_32f_divide_aligned16::t1() {
   float input1[vlen] __attribute__ ((aligned (16)));
   
   float output0[vlen] __attribute__ ((aligned (16)));
+  float output1[vlen] __attribute__ ((aligned (16)));
   float output_known[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
@@ -52,6 +53,14 @@ void qa_32f_divide_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output1, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   /*
   for(int i = 0; i < 10; ++i) {
@@ -62,6 +71,7 @@ void qa_32f_divide_aligned16::t1() {
   
   for(int i = 0; i < vlen; ++i) {
     CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output_known[i]);
   }
 }
 
@@ -79,6 +89,7 @@ void qa_32f_divide_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -95,6 +106,13 @@ void qa_32f_divide_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -108,6 +126,7 @@ void qa_32f_divide_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc
index ceb913cb4..98f8ce9bc 100644
--- a/volk/lib/qa_32f_max_aligned16.cc
+++ b/volk/lib/qa_32f_max_aligned16.cc
@@ -26,6 +26,7 @@ void qa_32f_max_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -42,6 +43,13 @@ void qa_32f_max_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_max_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_32f_max_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc
index 580a60e7d..798b47c53 100644
--- a/volk/lib/qa_32f_min_aligned16.cc
+++ b/volk/lib/qa_32f_min_aligned16.cc
@@ -26,6 +26,7 @@ void qa_32f_min_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -42,6 +43,13 @@ void qa_32f_min_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_min_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_32f_min_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc
index 0c242b649..aa17cd62e 100644
--- a/volk/lib/qa_32f_multiply_aligned16.cc
+++ b/volk/lib/qa_32f_multiply_aligned16.cc
@@ -79,6 +79,7 @@ void qa_32f_multiply_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -95,6 +96,13 @@ void qa_32f_multiply_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_multiply_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -108,6 +116,7 @@ void qa_32f_multiply_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc
index 62d55767a..c216ce5d5 100644
--- a/volk/lib/qa_32f_sqrt_aligned16.cc
+++ b/volk/lib/qa_32f_sqrt_aligned16.cc
@@ -53,6 +53,14 @@ void qa_32f_sqrt_aligned16::t1() {
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
   
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  
   /*
   for(int i = 0; i < 10; ++i) {
     printf("inputs: %f\n", input0[i]);
@@ -94,6 +102,13 @@ void qa_32f_sqrt_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse");
   }
   end = clock();
diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc
index ffe4b504c..1e2210203 100644
--- a/volk/lib/qa_32f_subtract_aligned16.cc
+++ b/volk/lib/qa_32f_subtract_aligned16.cc
@@ -26,6 +26,7 @@ void qa_32f_subtract_aligned16::t1() {
   
   float output0[vlen] __attribute__ ((aligned (16)));
   float output01[vlen] __attribute__ ((aligned (16)));
+  float output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
@@ -42,6 +43,13 @@ void qa_32f_subtract_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32f_subtract_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_32f_subtract_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
index 16984e30d..c718b6b71 100644
--- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
@@ -24,6 +24,7 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
   
   int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_orc[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse[vlen] __attribute__ ((aligned (16)));
   int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -42,6 +43,13 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
   }
   end = clock();
@@ -57,14 +65,15 @@ void qa_32fc_magnitude_16s_aligned16::t1() {
   printf("sse3_time: %f\n", total);
 
   for(int i = 0; i < 1; ++i) {
-    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
-    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  //  printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag());
+  //  printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]);
   }
   
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1);
   }
 }
 
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
index b99f1ddcf..1d475fb86 100644
--- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
@@ -24,6 +24,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
   std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
   
   float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_orc[vlen] __attribute__ ((aligned (16)));
   float output_sse[vlen] __attribute__ ((aligned (16)));
   float output_sse3[vlen] __attribute__ ((aligned (16)));
 
@@ -42,6 +43,13 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_orc, input0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse");
   }
   end = clock();
@@ -65,6 +73,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
     CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4);
   }
 }
 
diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc
index 661801709..d20682147 100644
--- a/volk/lib/qa_32s_and_aligned16.cc
+++ b/volk/lib/qa_32s_and_aligned16.cc
@@ -26,6 +26,7 @@ void qa_32s_and_aligned16::t1() {
   
   int32_t output0[vlen] __attribute__ ((aligned (16)));
   int32_t output01[vlen] __attribute__ ((aligned (16)));
+  int32_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
@@ -42,6 +43,13 @@ void qa_32s_and_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32s_and_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_32s_and_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc
index 9da2ae344..bebf779b0 100644
--- a/volk/lib/qa_32s_or_aligned16.cc
+++ b/volk/lib/qa_32s_or_aligned16.cc
@@ -26,6 +26,7 @@ void qa_32s_or_aligned16::t1() {
   
   int32_t output0[vlen] __attribute__ ((aligned (16)));
   int32_t output01[vlen] __attribute__ ((aligned (16)));
+  int32_t output02[vlen] __attribute__ ((aligned (16)));
 
   for(int i = 0; i < vlen; ++i) {   
     input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
@@ -42,6 +43,13 @@ void qa_32s_or_aligned16::t1() {
   printf("generic_time: %f\n", total);
   start = clock();
   for(int count = 0; count < ITERS; ++count) {
+    volk_32s_or_aligned16_manual(output02, input0, input1, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
     volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse");
   }
   end = clock();
@@ -55,6 +63,7 @@ void qa_32s_or_aligned16::t1() {
   for(int i = 0; i < vlen; ++i) {
     //printf("%d...%d\n", output0[i], output01[i]);
     CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]);
   }
 }
 
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc
index 672f5662f..f27e60552 100644
--- a/volk/lib/qa_8s_convert_32f_aligned16.cc
+++ b/volk/lib/qa_8s_convert_32f_aligned16.cc
@@ -41,6 +41,14 @@ void qa_8s_convert_32f_aligned16::t1() {
   end = clock();
   total = (double)(end-start)/(double)CLOCKS_PER_SEC;
   printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "orc");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("orc_time: %f\n", total);
 
   start = clock();
   for(int count = 0; count < ITERS; ++count) {