feat: support StarCoder model architectures #3187

Merged
merged 30 commits on Sep 15, 2023
Changes from 1 commit
Commits (30)
0c5d4d8
add placeholder of starcoder in gguf / llama.cpp
wsxiaoys Sep 15, 2023
eb7f0eb
support convert starcoder weights to gguf
wsxiaoys Sep 15, 2023
76d32cc
convert MQA to MHA
wsxiaoys Sep 15, 2023
7e0a843
fix ffn_down name
wsxiaoys Sep 15, 2023
7298c37
add LLM_ARCH_STARCODER to llama.cpp
wsxiaoys Sep 15, 2023
166a259
set head_count_kv = 1
wsxiaoys Sep 15, 2023
57f064d
load starcoder weight
wsxiaoys Sep 15, 2023
a17ef39
add max_position_embeddings
wsxiaoys Sep 15, 2023
2683611
set n_positions to max_position_embeddings
wsxiaoys Sep 15, 2023
77c7ec1
properly load all starcoder params
wsxiaoys Sep 15, 2023
0be15e1
fix head count kv
wsxiaoys Sep 15, 2023
dac31da
fix comments
wsxiaoys Sep 15, 2023
4420cff
fix vram calculation for starcoder
wsxiaoys Sep 15, 2023
ab13d07
store mqa directly
wsxiaoys Sep 15, 2023
8bc76a2
add input embeddings handling
wsxiaoys Sep 15, 2023
101c578
add TBD
wsxiaoys Sep 15, 2023
a1cf66e
working in cpu, metal buggy
wsxiaoys Sep 15, 2023
6c353dc
cleanup useless code
wsxiaoys Sep 15, 2023
f82328a
metal : fix out-of-bounds access in soft_max kernels
ggerganov Sep 15, 2023
92a4f86
llama : make starcoder graph build more consistent with others
ggerganov Sep 15, 2023
caa7220
Merge pull request #2 from ggerganov/support-starcoder-fix
wsxiaoys Sep 15, 2023
57eaa39
refactor: cleanup comments a bit
wsxiaoys Sep 15, 2023
5ca037b
add other starcoder models: 3B, 7B, 15B
wsxiaoys Sep 15, 2023
08f35c4
support-mqa-directly
wsxiaoys Sep 15, 2023
e1fa9dd
Merge pull request #3 from TabbyML/support-starcoder-mqa
wsxiaoys Sep 15, 2023
f989ba1
fix: remove max_position_embeddings, use n_train_ctx
wsxiaoys Sep 15, 2023
bb9931c
Update llama.cpp
wsxiaoys Sep 15, 2023
eafcc34
Update llama.cpp
wsxiaoys Sep 15, 2023
e30ad71
Apply suggestions from code review
wsxiaoys Sep 15, 2023
72a7285
fix: switch to space from tab
wsxiaoys Sep 15, 2023
llama : make starcoder graph build more consistent with others
ggerganov committed Sep 15, 2023
commit 92a4f8687933c9f0a6847097e1a5086f6cd44720
llama.cpp: 100 changes (62 additions & 38 deletions)
@@ -3446,7 +3446,9 @@ static struct ggml_cgraph * llm_build_starcoder(
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = hparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();

GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3508,28 +3510,44 @@ static struct ggml_cgraph * llm_build_starcoder(
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
}

struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");

inpL = ggml_add(ctx0, token, position);
ggml_set_name(inpL, "inpL");

for (int il = 0; il < n_layer; ++il) {
{
// Norm
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);

}

{
// Self Attention
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);

struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);

// store key and value to memory
if (N >= 1) {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * Qcur = tmpq;
struct ggml_tensor * Kcur = tmpk;

{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
ggml_set_name(Vcur, "Vcur");

struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
ggml_set_name(k, "k");

struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));

ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3541,56 +3559,62 @@ static struct ggml_cgraph * llm_build_starcoder(
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
0, 2, 1, 3);
ggml_set_name(Q, "Q");

struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3); //TODO: need to be tiled
ggml_view_3d(ctx0, kv_self.k,
n_embd_head, n_past + N, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
ggml_set_name(K, "K");

// K * Q
// [n_past + N, N, 12]
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
ggml_set_name(KQ, "KQ");

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");

// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
ggml_set_name(KQ_masked, "KQ_masked");

// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));

// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
ggml_set_name(V, "V");

#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
ggml_set_name(KQV, "KQV");
#else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
// is there a better way?
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif

// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
ggml_set_name(KQV_merged, "KQV_merged");

// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
ggml_set_name(cur, "KQV_merged_contiguous");
}

// Projection
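For readers following the KV-cache views in the diff above: the ggml_view_2d call on kv_self.v addresses a transposed value cache, where each embedding channel of a layer occupies a contiguous run of n_ctx positions. The short standalone sketch below reproduces that offset and stride arithmetic with hypothetical example sizes (F16 cache elements, n_head_kv = 1 as in StarCoder's MQA); the constants and names are illustrative only and are not taken from llama.cpp.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical example sizes (not read from a real model file).
    const int64_t n_ctx       = 2048;  // context length reserved in the KV cache
    const int64_t n_embd_head = 128;   // embedding size per head
    const int64_t n_head_kv   = 1;     // MQA: a single shared KV head
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv;
    const size_t  elt_size    = 2;     // assume F16 cache elements (2 bytes)

    const int64_t il     = 3;          // example layer index
    const int64_t n_past = 10;         // tokens already stored for this layer

    // Mirrors the ggml_view_2d(kv_self.v, ...) call in the diff: each embedding
    // channel is stored as a contiguous run of n_ctx positions, so the stride
    // between channels is n_ctx elements, and the view starts at the layer's
    // base offset plus the n_past column.
    const size_t row_stride = (size_t)n_ctx * elt_size;
    const size_t offset     = (size_t)(il * n_ctx) * elt_size * n_embd_gqa
                            + (size_t)n_past * elt_size;

    std::printf("V view: offset = %zu bytes, row stride = %zu bytes\n", offset, row_stride);
    return 0;
}

Compiling and running this prints the byte offset and row stride that the view in the diff would use for layer il at position n_past under the assumed sizes.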