From c6bc30bab73781c3d02f66d4d636045328fd190f Mon Sep 17 00:00:00 2001
From: strobe <strobe@google.com>
Date: Mon, 25 Sep 2017 09:41:58 -0700
Subject: [PATCH] Add NEON-accelerated HDR conversion routines to VPX.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=169919087
---
 extensions/vp9/src/main/jni/vpx_jni.cc | 265 +++++++++++++++++++++----
 1 file changed, 223 insertions(+), 42 deletions(-)

diff --git a/extensions/vp9/src/main/jni/vpx_jni.cc b/extensions/vp9/src/main/jni/vpx_jni.cc
index f0b93b1dc2..5c480d1525 100644
--- a/extensions/vp9/src/main/jni/vpx_jni.cc
+++ b/extensions/vp9/src/main/jni/vpx_jni.cc
@@ -15,6 +15,9 @@
  */
 
 #include <cpu-features.h>
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif
 #include <jni.h>
 
 #include <android/log.h>
@@ -70,6 +73,216 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) {
   return JNI_VERSION_1_6;
 }
 
+#ifdef __ARM_NEON__
+// Converts a 10-bit image (16-bit samples) to 8 bits per sample with random
+// dither. Y goes to data, U to data + yLength, V to data + yLength + uvLength;
+// output rows keep the source planes' byte strides. Returns 1 on success, or
+// 0 without writing anything when NEON is unavailable at runtime.
+static int convert_16_to_8_neon(const vpx_image_t* const img, jbyte* const data,
+                                const int32_t uvHeight, const int32_t yLength,
+                                const int32_t uvLength) {
+  if (!(android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON)) return 0;
+  uint32x2_t lcg_val = vset_lane_u32(random(), vdup_n_u32(random()), 1);
+  // LCG values recommended in good ol' "Numerical Recipes"
+  const uint32x2_t LCG_MULT = vdup_n_u32(1664525);
+  const uint32x2_t LCG_INCR = vdup_n_u32(1013904223);
+
+  const uint16_t* srcBase =
+      reinterpret_cast<uint16_t*>(img->planes[VPX_PLANE_Y]);
+  uint8_t* dstBase = reinterpret_cast<uint8_t*>(data);
+  // In units of uint16_t, so /2 from raw stride
+  const int srcStride = img->stride[VPX_PLANE_Y] / 2;
+  const int dstStride = img->stride[VPX_PLANE_Y];
+
+  for (int y = 0; y < img->d_h; y++) {
+    const uint16_t* src = srcBase;
+    uint8_t* dst = dstBase;
+
+    // Each read consumes 4 2-byte samples, but to reduce branches and
+    // random steps we unroll to four rounds, so each loop consumes 16
+    // samples.
+    const int imax = img->d_w & ~15;
+    int i;
+    for (i = 0; i < imax; i += 16) {
+      // Run a round of the RNG.
+      lcg_val = vmla_u32(LCG_INCR, lcg_val, LCG_MULT);
+
+      // The lower two bits of this LCG parameterization are garbage,
+      // leaving streaks on the image. We access the upper bits of each
+      // 16-bit lane by shifting. (We use this both as an 8- and 16-bit
+      // vector, so the choice of which one to keep it as is arbitrary.)
+      uint8x8_t randvec =
+          vreinterpret_u8_u16(vshr_n_u16(vreinterpret_u16_u32(lcg_val), 8));
+
+      // We retrieve the values and shift them so that the bits we'll
+      // shift out (after biasing) are in the upper 8 bits of each 16-bit
+      // lane.
+      uint16x4_t values = vshl_n_u16(vld1_u16(src), 6);
+      src += 4;
+
+      // We add the bias bits in the lower 8 to the shifted values to get
+      // the final values in the upper 8 bits.
+      uint16x4_t added1 = vqadd_u16(values, vreinterpret_u16_u8(randvec));
+
+      // Shifting the randvec bits left by 2 bits, as an 8-bit vector,
+      // should leave us with enough bias to get the needed rounding
+      // operation.
+      randvec = vshl_n_u8(randvec, 2);
+
+      // Retrieve and sum the next 4 pixels.
+      values = vshl_n_u16(vld1_u16(src), 6);
+      src += 4;
+      uint16x4_t added2 = vqadd_u16(values, vreinterpret_u16_u8(randvec));
+
+      // Reinterpret the two added vectors as 8x8, zip them together, and
+      // discard the lower portions.
+      uint8x8_t zipped =
+          vuzp_u8(vreinterpret_u8_u16(added1), vreinterpret_u8_u16(added2))
+              .val[1];
+      vst1_u8(dst, zipped);
+      dst += 8;
+
+      // Run it again with the next two rounds using the remaining
+      // entropy in randvec.
+      randvec = vshl_n_u8(randvec, 2);
+      values = vshl_n_u16(vld1_u16(src), 6);
+      src += 4;
+      added1 = vqadd_u16(values, vreinterpret_u16_u8(randvec));
+      randvec = vshl_n_u8(randvec, 2);
+      values = vshl_n_u16(vld1_u16(src), 6);
+      src += 4;
+      added2 = vqadd_u16(values, vreinterpret_u16_u8(randvec));
+      zipped = vuzp_u8(vreinterpret_u8_u16(added1), vreinterpret_u8_u16(added2))
+                   .val[1];
+      vst1_u8(dst, zipped);
+      dst += 8;
+    }
+
+    // For the remaining pixels in each row - usually none, as most
+    // standard sizes are divisible by 16 - convert them "by hand",
+    // saturating at 255 just as vqadd_u16 does in the vector loop.
+    for (uint32_t randval = 0; i < img->d_w; i++, randval >>= 2) {
+      if (!randval) randval = random();
+      const uint32_t biased = (srcBase[i] + (randval & 3)) >> 2;
+      dstBase[i] = static_cast<uint8_t>(biased > 255 ? 255 : biased);
+    }
+
+    srcBase += srcStride;
+    dstBase += dstStride;
+  }
+
+  const uint16_t* srcUBase =
+      reinterpret_cast<uint16_t*>(img->planes[VPX_PLANE_U]);
+  const uint16_t* srcVBase =
+      reinterpret_cast<uint16_t*>(img->planes[VPX_PLANE_V]);
+  const int32_t uvWidth = (img->d_w + 1) / 2;
+  uint8_t* dstUBase = reinterpret_cast<uint8_t*>(data + yLength);
+  uint8_t* dstVBase = reinterpret_cast<uint8_t*>(data + yLength + uvLength);
+  const int srcUVStride = img->stride[VPX_PLANE_V] / 2;
+  const int dstUVStride = img->stride[VPX_PLANE_V];
+
+  for (int y = 0; y < uvHeight; y++) {
+    const uint16_t* srcU = srcUBase;
+    const uint16_t* srcV = srcVBase;
+    uint8_t* dstU = dstUBase;
+    uint8_t* dstV = dstVBase;
+
+    // Each load consumes 4 samples (8 bytes). For simplicity we don't unroll
+    // these loops more than we have to: 8 U and 8 V samples per iteration.
+    const int imax = uvWidth & ~7;
+    int i;
+    for (i = 0; i < imax; i += 8) {
+      lcg_val = vmla_u32(LCG_INCR, lcg_val, LCG_MULT);
+      uint8x8_t randvec =
+          vreinterpret_u8_u16(vshr_n_u16(vreinterpret_u16_u32(lcg_val), 8));
+      uint16x4_t uVal1 = vqadd_u16(vshl_n_u16(vld1_u16(srcU), 6),
+                                   vreinterpret_u16_u8(randvec));
+      srcU += 4;
+      randvec = vshl_n_u8(randvec, 2);
+      uint16x4_t vVal1 = vqadd_u16(vshl_n_u16(vld1_u16(srcV), 6),
+                                   vreinterpret_u16_u8(randvec));
+      srcV += 4;
+      randvec = vshl_n_u8(randvec, 2);
+      uint16x4_t uVal2 = vqadd_u16(vshl_n_u16(vld1_u16(srcU), 6),
+                                   vreinterpret_u16_u8(randvec));
+      srcU += 4;
+      randvec = vshl_n_u8(randvec, 2);
+      uint16x4_t vVal2 = vqadd_u16(vshl_n_u16(vld1_u16(srcV), 6),
+                                   vreinterpret_u16_u8(randvec));
+      srcV += 4;
+      vst1_u8(dstU,
+              vuzp_u8(vreinterpret_u8_u16(uVal1), vreinterpret_u8_u16(uVal2))
+                  .val[1]);
+      dstU += 8;
+      vst1_u8(dstV,
+              vuzp_u8(vreinterpret_u8_u16(vVal1), vreinterpret_u8_u16(vVal2))
+                  .val[1]);
+      dstV += 8;
+    }
+
+    // i counts samples (not vector loads), so the tail resumes at i as-is.
+    for (uint32_t randval = 0; i < uvWidth; i++, randval >>= 4) {
+      if (!randval) randval = random();
+      const uint32_t u = (srcUBase[i] + (randval & 3)) >> 2;
+      const uint32_t v = (srcVBase[i] + ((randval >> 2) & 3)) >> 2;
+      dstUBase[i] = static_cast<uint8_t>(u > 255 ? 255 : u);
+      dstVBase[i] = static_cast<uint8_t>(v > 255 ? 255 : v);
+    }
+
+    srcUBase += srcUVStride;
+    srcVBase += srcUVStride;
+    dstUBase += dstUVStride;
+    dstVBase += dstUVStride;
+  }
+
+  return 1;
+}
+
+#endif  // __ARM_NEON__
+
+// Scalar fallback: converts 10-bit samples (stored as 16 bits) to 8 bits,
+// dithering by carrying each sample's 2-bit remainder into the next sample.
+static void convert_16_to_8_standard(const vpx_image_t* const img,
+                                     jbyte* const data, const int32_t uvHeight,
+                                     const int32_t yLength,
+                                     const int32_t uvLength) {
+  // Y plane, written at the start of data.
+  int sampleY = 0;
+  for (int y = 0; y < img->d_h; y++) {
+    const uint16_t* srcBase = reinterpret_cast<uint16_t*>(
+        img->planes[VPX_PLANE_Y] + img->stride[VPX_PLANE_Y] * y);
+    int8_t* destBase = data + img->stride[VPX_PLANE_Y] * y;
+    for (int x = 0; x < img->d_w; x++) {
+      // Saturate: a full-scale sample plus carry would otherwise wrap to 0.
+      sampleY += *srcBase++;
+      *destBase++ = sampleY > 1023 ? 255 : sampleY >> 2;
+      sampleY = sampleY & 3;  // Remainder.
+    }
+  }
+  // U and V planes, at data + yLength and data + yLength + uvLength.
+  int sampleU = 0;
+  int sampleV = 0;
+  const int32_t uvWidth = (img->d_w + 1) / 2;
+  for (int y = 0; y < uvHeight; y++) {
+    const uint16_t* srcUBase = reinterpret_cast<uint16_t*>(
+        img->planes[VPX_PLANE_U] + img->stride[VPX_PLANE_U] * y);
+    const uint16_t* srcVBase = reinterpret_cast<uint16_t*>(
+        img->planes[VPX_PLANE_V] + img->stride[VPX_PLANE_V] * y);
+    int8_t* destUBase = data + yLength + img->stride[VPX_PLANE_U] * y;
+    int8_t* destVBase =
+        data + yLength + uvLength + img->stride[VPX_PLANE_V] * y;
+    for (int x = 0; x < uvWidth; x++) {
+      // Same carry-over dither and saturation as for the Y plane.
+      sampleU += *srcUBase++;
+      *destUBase++ = sampleU > 1023 ? 255 : sampleU >> 2;
+      sampleU = sampleU & 3;  // Remainder.
+      sampleV += *srcVBase++;
+      *destVBase++ = sampleV > 1023 ? 255 : sampleV >> 2;
+      sampleV = sampleV & 3;  // Remainder.
+    }
+  }
+}
+
 DECODER_FUNC(jlong, vpxInit) {
   vpx_codec_ctx_t* context = new vpx_codec_ctx_t();
   vpx_codec_dec_cfg_t cfg = {0, 0, 0};
@@ -201,47 +414,17 @@ DECODER_FUNC(jint, vpxGetFrame, jlong jContext, jobject jOutputBuffer) {
       // Note: The stride for BT2020 is twice of what we use so this is wasting
       // memory. The long term goal however is to upload half-float/short so
       // it's not important to optimize the stride at this time.
-      // Y
-      int sampleY = 0;
-      for (int y = 0; y < img->d_h; y++) {
-        const uint16_t* srcBase = reinterpret_cast<uint16_t*>(
-            img->planes[VPX_PLANE_Y] + img->stride[VPX_PLANE_Y] * y);
-        int8_t* destBase = data + img->stride[VPX_PLANE_Y] * y;
-        for (int x = 0; x < img->d_w; x++) {
-          // Lightweight dither. Carryover the remainder of each 10->8 bit
-          // conversion to the next pixel.
-          sampleY += *srcBase++;
-          *destBase++ = sampleY >> 2;
-          sampleY = sampleY & 3;  // Remainder.
-        }
-      }
-      // UV
-      int sampleU = 0;
-      int sampleV = 0;
-      const int32_t uvWidth = (img->d_w + 1) / 2;
-      for (int y = 0; y < uvHeight; y++) {
-        const uint16_t* srcUBase = reinterpret_cast<uint16_t*>(
-            img->planes[VPX_PLANE_U] + img->stride[VPX_PLANE_U] * y);
-        const uint16_t* srcVBase = reinterpret_cast<uint16_t*>(
-            img->planes[VPX_PLANE_V] + img->stride[VPX_PLANE_V] * y);
-        int8_t* destUBase = data + yLength + img->stride[VPX_PLANE_U] * y;
-        int8_t* destVBase = data + yLength + uvLength
-            + img->stride[VPX_PLANE_V] * y;
-        for (int x = 0; x < uvWidth; x++) {
-          // Lightweight dither. Carryover the remainder of each 10->8 bit
-          // conversion to the next pixel.
-          sampleU += *srcUBase++;
-          *destUBase++ = sampleU >> 2;
-          sampleU = sampleU & 3;  // Remainder.
-          sampleV += *srcVBase++;
-          *destVBase++ = sampleV >> 2;
-          sampleV = sampleV & 3;  // Remainder.
-        }
+      int converted = 0;
+#ifdef __ARM_NEON__
+      converted = convert_16_to_8_neon(img, data, uvHeight, yLength, uvLength);
+#endif  // __ARM_NEON__
+      if (!converted) {
+        convert_16_to_8_standard(img, data, uvHeight, yLength, uvLength);
       }
     } else {
-      // TODO: This copy can be eliminated by using external frame buffers. This
-      // is insignificant for smaller videos but takes ~1.5ms for 1080p clips.
-      // So this should eventually be gotten rid of.
+      // TODO: This copy can be eliminated by using external frame
+      // buffers. This is insignificant for smaller videos but takes ~1.5ms
+      // for 1080p clips. So this should eventually be gotten rid of.
       memcpy(data, img->planes[VPX_PLANE_Y], yLength);
       memcpy(data + yLength, img->planes[VPX_PLANE_U], uvLength);
       memcpy(data + yLength + uvLength, img->planes[VPX_PLANE_V], uvLength);
@@ -255,9 +438,7 @@ DECODER_FUNC(jstring, vpxGetErrorMessage, jlong jContext) {
   return env->NewStringUTF(vpx_codec_error(context));
 }
 
-DECODER_FUNC(jint, vpxGetErrorCode, jlong jContext) {
-  return errorCode;
-}
+DECODER_FUNC(jint, vpxGetErrorCode, jlong jContext) { return errorCode; }
 
 LIBRARY_FUNC(jstring, vpxIsSecureDecodeSupported) {
   // Doesn't support