Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ggml/src/ggml-cpu/arch-fallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
Expand Down
79 changes: 79 additions & 0 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,85 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif

void ggml_vec_dot_q2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK2_0;
const int nb = n / qk;

assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);

const block_q2_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;

float sumf = 0.0f;

// group 64: one Q2_0 block (64 weights) maps to two Q8_0 blocks (2 * 32 = 64)
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
// AVX-512-VNNI: unpack 2-bit codes c in {0,1,2,3} (value = c-1), then
// dot((c-1), qy) = dpbusd(c, qy) - dpbusd(1, qy).
const __m256i ones = _mm256_set1_epi8(1);
const __m128i idxlo = _mm_setr_epi8(0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3);
const __m128i idxhi = _mm_setr_epi8(4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7);
const __m256i mul = _mm256_setr_epi16(64,16,4,1, 64,16,4,1, 64,16,4,1, 64,16,4,1); // <<(6-2c)
const __m256i three = _mm256_set1_epi16(3);
for (int i = 0; i < nb; i++) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
float sumi = 0.0f;
for (int k = 0; k < 2; k++) {
const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
const __m256i qy = _mm256_loadu_si256((const __m256i *) yb->qs);
const __m128i src = _mm_loadl_epi64((const __m128i *) &x[i].qs[k * 8]); // 8 bytes
// replicate each byte 4x, then extract field c via (b<<(6-2c))>>6 & 3
const __m256i rep = _mm256_set_m128i(_mm_shuffle_epi8(src, idxhi), _mm_shuffle_epi8(src, idxlo));
__m256i r0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(rep));
__m256i r1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(rep, 1));
r0 = _mm256_and_si256(_mm256_srli_epi16(_mm256_mullo_epi16(r0, mul), 6), three);
r1 = _mm256_and_si256(_mm256_srli_epi16(_mm256_mullo_epi16(r1, mul), 6), three);
__m256i codes = _mm256_permute4x64_epi64(_mm256_packus_epi16(r0, r1), 0xD8); // 32 codes in order
const int dp = hsum_i32_8(_mm256_dpbusd_epi32(_mm256_setzero_si256(), codes, qy));
const int sy = hsum_i32_8(_mm256_dpbusd_epi32(_mm256_setzero_si256(), ones, qy));
sumi += d1 * (float)(dp - sy);
}
sumf += d0 * sumi;
}
#else
for (int i = 0; i < nb; i++) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);

float sumi = 0.0f;
Comment on lines +601 to +605

for (int k = 0; k < 2; k++) {
const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
int sumi_block = 0;

const uint8_t * GGML_RESTRICT qs = &x[i].qs[k * 8];
const int8_t * GGML_RESTRICT qy = yb->qs;

for (int b = 0; b < 8; ++b) {
const uint8_t byte = qs[b];
// Extract 4 two-bit values, map {0,1,2,3} -> {-1,0,1,2}
sumi_block += ((int)((byte >> 0) & 3) - 1) * qy[b*4 + 0];
sumi_block += ((int)((byte >> 2) & 3) - 1) * qy[b*4 + 1];
sumi_block += ((int)((byte >> 4) & 3) - 1) * qy[b*4 + 2];
sumi_block += ((int)((byte >> 6) & 3) - 1) * qy[b*4 + 3];
}

sumi += d1 * sumi_block;
}

sumf += d0 * sumi;
}
#endif

*s = sumf;
}

void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0;
const int nb = n / qk;
Expand Down