Name	Description
SHUFFLE2(int, int)	Return a shuffle immediate suitable for use with shuffle_pd and similar instructions.
add_epi16(v128, v128)	Add packed 16-bit integers in "a" and "b", and store the results in "dst".
add_epi32(v128, v128)	Add packed 32-bit integers in "a" and "b", and store the results in "dst".
add_epi64(v128, v128)	Add packed 64-bit integers in "a" and "b", and store the results in "dst".
add_epi8(v128, v128)	Add packed 8-bit integers in "a" and "b", and store the results in "dst".
add_pd(v128, v128)	Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
add_sd(v128, v128)	Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
adds_epi16(v128, v128)	Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst".
adds_epi8(v128, v128)	Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst".
adds_epu16(v128, v128)	Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst".
adds_epu8(v128, v128)	Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst".
and_pd(v128, v128)	Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
and_si128(v128, v128)	Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".
andnot_pd(v128, v128)	Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst".
andnot_si128(v128, v128)	Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst".
avg_epu16(v128, v128)	Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst".
avg_epu8(v128, v128)	Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst".
bslli_si128(v128, int)	Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".
bsrli_si128(v128, int)	Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".
clflush(void*)	Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.
cmpeq_epi16(v128, v128)	Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst".
cmpeq_epi32(v128, v128)	Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst".
cmpeq_epi8(v128, v128)	Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst".
cmpeq_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst".
cmpeq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpge_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst".
cmpge_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpgt_epi16(v128, v128)	Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst".
cmpgt_epi32(v128, v128)	Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst".
cmpgt_epi8(v128, v128)	Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst".
cmpgt_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst".
cmpgt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmple_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst".
cmple_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmplt_epi16(v128, v128)	Compare packed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.
cmplt_epi32(v128, v128)	Compare packed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.
cmplt_epi8(v128, v128)	Compare packed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.
cmplt_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst".
cmplt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpneq_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst".
cmpneq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpnge_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst".
cmpnge_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpngt_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst".
cmpngt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpnle_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst".
cmpnle_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpnlt_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst".
cmpnlt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpord_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst".
cmpord_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cmpunord_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst".
cmpunord_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
comieq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1).
comige_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1).
comigt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1).
comile_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1).
comilt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1).
comineq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1).
cvtepi32_pd(v128)	Convert packed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
cvtepi32_ps(v128)	Convert packed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
cvtpd_epi32(v128)	Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".
cvtpd_ps(v128)	Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst".
cvtps_epi32(v128)	Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst".
cvtps_pd(v128)	Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst".
cvtsd_f64(v128)	Copy the lower double-precision (64-bit) floating-point element of "a" to "dst".
cvtsd_si32(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst".
cvtsd_si64(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
cvtsd_si64x(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst".
cvtsd_ss(v128, v128)	Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cvtsi128_si32(v128)	Copy the lower 32-bit integer in "a" to "dst".
cvtsi128_si64(v128)	Copy the lower 64-bit integer in "a" to "dst".
cvtsi128_si64x(v128)	Copy the lower 64-bit integer in "a" to "dst".
cvtsi32_sd(v128, int)	Convert the 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cvtsi32_si128(int)	Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst".
cvtsi64_sd(v128, long)	Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cvtsi64_si128(long)	Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.
cvtsi64x_sd(v128, long)	Convert the 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cvtsi64x_si128(long)	Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element.
cvtss_sd(v128, v128)	Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
cvttpd_epi32(v128)	Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".
cvttps_epi32(v128)	Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst".
cvttsd_si32(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst".
cvttsd_si64(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
cvttsd_si64x(v128)	Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst".
div_pd(v128, v128)	Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst".
div_sd(v128, v128)	Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
extract_epi16(v128, int)	Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst".
insert_epi16(v128, int, int)	Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8".
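load_si128(void*)	Load 128-bits of integer data from memory into "dst". "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.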
loadu_si128(void*)	Load 128-bits of integer data from memory into "dst". "mem_addr" does not need to be aligned on any particular boundary.
loadu_si32(void*)	Load unaligned 32-bit integer from memory into the first element of dst.
madd_epi16(v128, v128)	Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst".
max_epi16(v128, v128)	Compare packed 16-bit integers in "a" and "b", and store packed maximum values in "dst".
max_epu8(v128, v128)	Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst".
max_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst".
max_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
min_epi16(v128, v128)	Compare packed 16-bit integers in "a" and "b", and store packed minimum values in "dst".
min_epu8(v128, v128)	Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst".
min_pd(v128, v128)	Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst".
min_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
move_epi64(v128)	Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element.
move_sd(v128, v128)	Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
movemask_epi8(v128)	Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst".
movemask_pd(v128)	Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a".
mul_epu32(v128, v128)	Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst".
mul_pd(v128, v128)	Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
mul_sd(v128, v128)	Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
mulhi_epi16(v128, v128)	Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".
mulhi_epu16(v128, v128)	Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst".
mullo_epi16(v128, v128)	Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst".
or_pd(v128, v128)	Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
or_si128(v128, v128)	Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".
packs_epi16(v128, v128)	Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst".
packs_epi32(v128, v128)	Convert packed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst".
packus_epi16(v128, v128)	Convert packed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst".
sad_epu8(v128, v128)	Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst".
set1_epi16(short)	Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw".
set1_epi32(int)	Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd".
set1_epi64x(long)	Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq".
set1_epi8(sbyte)	Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb".
set1_pd(double)	Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".
set_epi16(short, short, short, short, short, short, short, short)	Set packed 16-bit integers in "dst" with the supplied values.
set_epi32(int, int, int, int)	Set packed 32-bit integers in "dst" with the supplied values.
set_epi64x(long, long)	Set packed 64-bit integers in "dst" with the supplied values.
set_epi8(sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte)	Set packed 8-bit integers in "dst" with the supplied values.
set_pd(double, double)	Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values.
set_pd1(double)	Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst".
set_sd(double)	Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element.
setr_epi16(short, short, short, short, short, short, short, short)	Set packed 16-bit integers in "dst" with the supplied values in reverse order.
setr_epi32(int, int, int, int)	Set packed 32-bit integers in "dst" with the supplied values in reverse order.
setr_epi8(sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte, sbyte)	Set packed 8-bit integers in "dst" with the supplied values in reverse order.
setr_pd(double, double)	Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order.
setzero_si128()	Return vector of type __m128i with all elements set to zero.
shuffle_epi32(v128, int)	Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst".
shuffle_pd(v128, v128, int)	Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst".
shufflehi_epi16(v128, int)	Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst".
shufflelo_epi16(v128, int)	Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst".
sll_epi16(v128, v128)	Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".
sll_epi32(v128, v128)	Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".
sll_epi64(v128, v128)	Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst".
slli_epi16(v128, int)	Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".
slli_epi32(v128, int)	Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".
slli_epi64(v128, int)	Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst".
slli_si128(v128, int)	Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst".
sqrt_pd(v128)	Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
sqrt_sd(v128, v128)	Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
sra_epi16(v128, v128)	Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".
sra_epi32(v128, v128)	Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst".
srai_epi16(v128, int)	Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".
srai_epi32(v128, int)	Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst".
srl_epi16(v128, v128)	Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".
srl_epi32(v128, v128)	Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".
srl_epi64(v128, v128)	Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst".
srli_epi16(v128, int)	Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".
srli_epi32(v128, int)	Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".
srli_epi64(v128, int)	Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst".
srli_si128(v128, int)	Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst".
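store_si128(void*, v128)	Store 128-bits of integer data from "a" into memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.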
storeu_si128(void*, v128)	Store 128-bits of integer data from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary.
storeu_si32(void*, v128)	Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
stream_pd(void*, v128)	Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception will be generated.
stream_si128(void*, v128)	Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated.
stream_si32(int*, int)	Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated.
stream_si64(long*, long)	Store 64-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated.
sub_epi16(v128, v128)	Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst".
sub_epi32(v128, v128)	Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst".
sub_epi64(v128, v128)	Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst".
sub_epi8(v128, v128)	Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst".
sub_pd(v128, v128)	Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst".
sub_sd(v128, v128)	Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst".
subs_epi16(v128, v128)	Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst".
subs_epi8(v128, v128)	Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst".
subs_epu16(v128, v128)	Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst".
subs_epu8(v128, v128)	Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst".
ucomieq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
ucomige_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
ucomigt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
ucomile_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
ucomilt_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
ucomineq_sd(v128, v128)	Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
unpackhi_epi16(v128, v128)	Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst".
unpackhi_epi32(v128, v128)	Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst".
unpackhi_epi64(v128, v128)	Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst".
unpackhi_epi8(v128, v128)	Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst".
unpackhi_pd(v128, v128)	Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst".
unpacklo_epi16(v128, v128)	Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst".
unpacklo_epi32(v128, v128)	Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst".
unpacklo_epi64(v128, v128)	Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst".
unpacklo_epi8(v128, v128)	Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst".
unpacklo_pd(v128, v128)	Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst".
xor_pd(v128, v128)	Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst".
xor_si128(v128, v128)	Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst".