3 files changed, 472 insertions, 0 deletions
diff --git a/libgrust/rustc-lib/stdarch/examples/Cargo.toml b/libgrust/rustc-lib/stdarch/examples/Cargo.toml
new file mode 100644
index 0000000..72599b4
--- /dev/null
+++ b/libgrust/rustc-lib/stdarch/examples/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "stdarch_examples"
+version = "0.0.0"
+authors = [
+    "Alex Crichton <alex@alexcrichton.com>",
+    "Andrew Gallant <jamslam@gmail.com>",
+    "Gonzalo Brito Gadeschi <gonzalobg88@gmail.com>",
+]
+description = "Examples of the stdarch crate."
+edition = "2018"
+
+[dependencies]
+core_arch = { path = "../crates/core_arch" }
+std_detect = { path = "../crates/std_detect" }
+quickcheck = "0.9"
+rand = "0.7"
+
+[[bin]]
+name = "hex"
+path = "hex.rs"
+
+[[example]]
+name = "wasm"
+crate-type = ["cdylib"]
+path = "wasm.rs"
diff --git a/libgrust/rustc-lib/stdarch/examples/hex.rs b/libgrust/rustc-lib/stdarch/examples/hex.rs
new file mode 100644
index 0000000..4382698
--- /dev/null
+++ b/libgrust/rustc-lib/stdarch/examples/hex.rs
@@ -0,0 +1,402 @@
+//! An example showing runtime dispatch to an architecture-optimized
+//! implementation.
+//!
+//! This program implements hex encoding a slice into a predetermined
+//! destination using various different instruction sets. This selects at
+//! runtime the most optimized implementation and uses that rather than being
+//! required to be compiled differently.
+//!
+//! You can test out this program via:
+//!
+//!     echo test | cargo +nightly run --release hex
+//!
+//! and you should see `746573740a` get printed out.
+
+#![feature(stdsimd, wasm_target_feature)]
+#![cfg_attr(test, feature(test))]
+#![cfg_attr(target_arch = "wasm32", feature(wasm_simd))]
+#![allow(
+    clippy::result_unwrap_used,
+    clippy::print_stdout,
+    clippy::option_unwrap_used,
+    clippy::shadow_reuse,
+    clippy::cast_possible_wrap,
+    clippy::cast_ptr_alignment,
+    clippy::cast_sign_loss,
+    clippy::missing_docs_in_private_items
+)]
+
+use std::{
+    io::{self, Read},
+    str,
+};
+
+#[cfg(target_arch = "x86")]
+use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected};
+#[cfg(target_arch = "x86_64")]
+use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected};
+
+fn main() {
+    let mut input = Vec::new();
+    io::stdin().read_to_end(&mut input).unwrap();
+    let mut dst = vec![0; 2 * input.len()];
+    let s = hex_encode(&input, &mut dst).unwrap();
+    println!("{}", s);
+}
+
+fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+    let len = src.len().checked_mul(2).unwrap();
+    if dst.len() < len {
+        return Err(len);
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        if is_x86_feature_detected!("avx2") {
+            return unsafe { hex_encode_avx2(src, dst) };
+        }
+        if is_x86_feature_detected!("sse4.1") {
+            return unsafe { hex_encode_sse41(src, dst) };
+        }
+    }
+    #[cfg(target_arch = "wasm32")]
+    {
+        if true {
+            return unsafe { hex_encode_simd128(src, dst) };
+        }
+    }
+
+    hex_encode_fallback(src, dst)
+}
+
+#[target_feature(enable = "avx2")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+    let ascii_zero = _mm256_set1_epi8(b'0' as i8);
+    let nines = _mm256_set1_epi8(9);
+    let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8);
+    let and4bits = _mm256_set1_epi8(0xf);
+
+    let mut i = 0_isize;
+    while src.len() >= 32 {
+        let invec = _mm256_loadu_si256(src.as_ptr() as *const _);
+
+        let masked1 = _mm256_and_si256(invec, and4bits);
+        let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits);
+
+        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
+        let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines);
+        let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines);
+
+        // add '0' or the offset depending on the masks
+        let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
+        let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2));
+
+        // interleave masked1 and masked2 bytes
+        let res1 = _mm256_unpacklo_epi8(masked2, masked1);
+        let res2 = _mm256_unpackhi_epi8(masked2, masked1);
+
+        // Store everything into the right destination now
+        let base = dst.as_mut_ptr().offset(i * 2);
+        let base1 = base.offset(0) as *mut _;
+        let base2 = base.offset(16) as *mut _;
+        let base3 = base.offset(32) as *mut _;
+        let base4 = base.offset(48) as *mut _;
+        _mm256_storeu2_m128i(base3, base1, res1);
+        _mm256_storeu2_m128i(base4, base2, res2);
+        src = &src[32..];
+        i += 32;
+    }
+
+    let i = i as usize;
+    let _ = hex_encode_sse41(src, &mut dst[i * 2..]);
+
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
+
+// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
+#[target_feature(enable = "sse4.1")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+    let ascii_zero = _mm_set1_epi8(b'0' as i8);
+    let nines = _mm_set1_epi8(9);
+    let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
+    let and4bits = _mm_set1_epi8(0xf);
+
+    let mut i = 0_isize;
+    while src.len() >= 16 {
+        let invec = _mm_loadu_si128(src.as_ptr() as *const _);
+
+        let masked1 = _mm_and_si128(invec, and4bits);
+        let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
+
+        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
+        let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
+        let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
+
+        // add '0' or the offset depending on the masks
+        let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
+        let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2));
+
+        // interleave masked1 and masked2 bytes
+        let res1 = _mm_unpacklo_epi8(masked2, masked1);
+        let res2 = _mm_unpackhi_epi8(masked2, masked1);
+
+        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
+        src = &src[16..];
+        i += 16;
+    }
+
+    let i = i as usize;
+    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);
+
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
+
+#[cfg(target_arch = "wasm32")]
+#[target_feature(enable = "simd128")]
+unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+    use core_arch::arch::wasm32::*;
+
+    let ascii_zero = i8x16_splat(b'0' as i8);
+    let nines = i8x16_splat(9);
+    let ascii_a = i8x16_splat((b'a' - 9 - 1) as i8);
+    let and4bits = i8x16_splat(0xf);
+
+    let mut i = 0_isize;
+    while src.len() >= 16 {
+        let invec = v128_load(src.as_ptr() as *const _);
+
+        let masked1 = v128_and(invec, and4bits);
+        let masked2 = v128_and(i8x16_shr_u(invec, 4), and4bits);
+
+        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
+        let cmpmask1 = i8x16_gt_u(masked1, nines);
+        let cmpmask2 = i8x16_gt_u(masked2, nines);
+
+        // add '0' or the offset depending on the masks
+        let masked1 = i8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1));
+        let masked2 = i8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2));
+
+        // Next we need to shuffle around masked{1,2} to get back to the
+        // original source text order. The first element (res1) we'll store uses
+        // all the low bytes from the 2 masks and the second element (res2) uses
+        // all the upper bytes.
+        let res1 = v8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
+            masked2, masked1,
+        );
+        let res2 = v8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
+            masked2, masked1,
+        );
+
+        v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
+        v128_store(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
+        src = &src[16..];
+        i += 16;
+    }
+
+    let i = i as usize;
+    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);
+
+    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
+}
+
+fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
+    fn hex(byte: u8) -> u8 {
+        static TABLE: &[u8] = b"0123456789abcdef";
+        TABLE[byte as usize]
+    }
+
+    for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
+        slots[0] = hex((*byte >> 4) & 0xf);
+        slots[1] = hex(*byte & 0xf);
+    }
+
+    unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
+}
+
+// Run these with `cargo +nightly test --example hex -p stdarch`
+#[cfg(test)]
+mod tests {
+    use std::iter;
+
+    use super::*;
+
+    fn test(input: &[u8], output: &str) {
+        let tmp = || vec![0; input.len() * 2];
+
+        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
+        assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        unsafe {
+            if self::is_x86_feature_detected!("avx2") {
+                assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output);
+            }
+            if self::is_x86_feature_detected!("sse4.1") {
+                assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output);
+            }
+        }
+    }
+
+    #[test]
+    fn empty() {
+        test(b"", "");
+    }
+
+    #[test]
+    fn big() {
+        test(
+            &[0; 1024],
+            &iter::repeat('0').take(2048).collect::<String>(),
+        );
+    }
+
+    #[test]
+    fn odd() {
+        test(
+            &[0; 313],
+            &iter::repeat('0').take(313 * 2).collect::<String>(),
+        );
+    }
+
+    #[test]
+    fn avx_works() {
+        let mut input = [0; 33];
+        input[4] = 3;
+        input[16] = 3;
+        input[17] = 0x30;
+        input[21] = 1;
+        input[31] = 0x24;
+        test(
+            &input,
+            "\
+             0000000003000000\
+             0000000000000000\
+             0330000000010000\
+             0000000000000024\
+             00\
+             ",
+        );
+    }
+
+    quickcheck::quickcheck! {
+        fn encode_equals_fallback(input: Vec<u8>) -> bool {
+            let mut space1 = vec![0; input.len() * 2];
+            let mut space2 = vec![0; input.len() * 2];
+            let a = hex_encode(&input, &mut space1).unwrap();
+            let b = hex_encode_fallback(&input, &mut space2).unwrap();
+            a == b
+        }
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        fn avx_equals_fallback(input: Vec<u8>) -> bool {
+            if !self::is_x86_feature_detected!("avx2") {
+                return true
+            }
+            let mut space1 = vec![0; input.len() * 2];
+            let mut space2 = vec![0; input.len() * 2];
+            let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() };
+            let b = hex_encode_fallback(&input, &mut space2).unwrap();
+            a == b
+        }
+
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        fn sse41_equals_fallback(input: Vec<u8>) -> bool {
+            if !self::is_x86_feature_detected!("avx2") {
+                return true
+            }
+            let mut space1 = vec![0; input.len() * 2];
+            let mut space2 = vec![0; input.len() * 2];
+            let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() };
+            let b = hex_encode_fallback(&input, &mut space2).unwrap();
+            a == b
+        }
+    }
+}
+
+// Run these with `cargo +nightly bench --example hex -p stdarch`
+#[cfg(test)]
+mod benches {
+    extern crate rand;
+    extern crate test;
+
+    use self::rand::Rng;
+
+    use super::*;
+
+    const SMALL_LEN: usize = 117;
+    const LARGE_LEN: usize = 1 * 1024 * 1024;
+
+    fn doit(
+        b: &mut test::Bencher,
+        len: usize,
+        f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>,
+    ) {
+        let mut rng = rand::thread_rng();
+        let input = std::iter::repeat(())
+            .map(|()| rng.gen::<u8>())
+            .take(len)
+            .collect::<Vec<_>>();
+        let mut dst = vec![0; input.len() * 2];
+        b.bytes = len as u64;
+        b.iter(|| unsafe {
+            f(&input, &mut dst).unwrap();
+            dst[0]
+        });
+    }
+
+    #[bench]
+    fn small_default(b: &mut test::Bencher) {
+        doit(b, SMALL_LEN, hex_encode);
+    }
+
+    #[bench]
+    fn small_fallback(b: &mut test::Bencher) {
+        doit(b, SMALL_LEN, hex_encode_fallback);
+    }
+
+    #[bench]
+    fn large_default(b: &mut test::Bencher) {
+        doit(b, LARGE_LEN, hex_encode);
+    }
+
+    #[bench]
+    fn large_fallback(b: &mut test::Bencher) {
+        doit(b, LARGE_LEN, hex_encode_fallback);
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    mod x86 {
+        use super::*;
+
+        #[bench]
+        fn small_avx2(b: &mut test::Bencher) {
+            if self::is_x86_feature_detected!("avx2") {
+                doit(b, SMALL_LEN, hex_encode_avx2);
+            }
+        }
+
+        #[bench]
+        fn small_sse41(b: &mut test::Bencher) {
+            if self::is_x86_feature_detected!("sse4.1") {
+                doit(b, SMALL_LEN, hex_encode_sse41);
+            }
+        }
+
+        #[bench]
+        fn large_avx2(b: &mut test::Bencher) {
+            if self::is_x86_feature_detected!("avx2") {
+                doit(b, LARGE_LEN, hex_encode_avx2);
+            }
+        }
+
+        #[bench]
+        fn large_sse41(b: &mut test::Bencher) {
+            if self::is_x86_feature_detected!("sse4.1") {
+                doit(b, LARGE_LEN, hex_encode_sse41);
+            }
+        }
+    }
+}
diff --git a/libgrust/rustc-lib/stdarch/examples/wasm.rs b/libgrust/rustc-lib/stdarch/examples/wasm.rs
new file mode 100644
index 0000000..6b92ae9
--- /dev/null
+++ b/libgrust/rustc-lib/stdarch/examples/wasm.rs
@@ -0,0 +1,45 @@
+//! A simple slab allocator for pages in wasm
+
+#![feature(stdsimd)]
+#![cfg(target_arch = "wasm32")]
+
+use std::ptr;
+
+use core_arch::arch::wasm32::*;
+
+static mut HEAD: *mut *mut u8 = 0 as _;
+
+#[no_mangle]
+pub unsafe extern "C" fn page_alloc() -> *mut u8 {
+    if !HEAD.is_null() {
+        let next = *HEAD;
+        let ret = HEAD;
+        HEAD = next as *mut _;
+        return ret as *mut u8;
+    }
+
+    let ret = memory_grow(0, 1);
+
+    // if we failed to allocate a page then return null
+    if ret == usize::MAX {
+        return ptr::null_mut();
+    }
+
+    ((ret as u32) * page_size()) as *mut u8
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn page_free(page: *mut u8) {
+    let page = page as *mut *mut u8;
+    *page = HEAD as *mut u8;
+    HEAD = page;
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn memory_used() -> usize {
+    (page_size() * (memory_size(0) as u32)) as usize
+}
+
+fn page_size() -> u32 {
+    64 * 1024
+}