CivArchive

    Myopia_Minton_-_The_Looney_Tunes_Show-000011.safetensors - CivArchive

    GPT2-MYOPIC-ATTNONLY_global_step=9099.0_val_loss=3.31.ckpt
    GPT2-MYOPIC-ATTNONLY_global_step=9099.0_val_loss=3.31.ckpt
    by wiwu23900
    GPT2-MYOPIC-CUTGRAD_val_myopic_loss=3.73.ckpt
    GPT2-MYOPIC-CUTGRAD_val_myopic_loss=3.73.ckpt
    by wiwu23900
    GPT2-MYOPIC-FULL-TOKLOSS_val_myopic_loss=3.19.ckpt
    GPT2-MYOPIC-FULL-TOKLOSS_val_myopic_loss=3.19.ckpt
    by wiwu23900
    GPT2-MYOPIC-H-CUTGRAD_val_myopic_loss=3.40.ckpt
    GPT2-MYOPIC-H-CUTGRAD_val_myopic_loss=3.40.ckpt
    by wiwu23900
    GPT2-MYOPIC-H-FROMORIG_val_myopic_loss=3.26.ckpt
    GPT2-MYOPIC-H-FROMORIG_val_myopic_loss=3.26.ckpt
    by wiwu23900
    GPT2-MYOPIC-NO-ATTN-val_kl_loss=0.42.ckpt
    GPT2-MYOPIC-NO-ATTN-val_kl_loss=0.42.ckpt
    by wiwu23900
    GPT2-MYOPIC-NO-ORIG_val_myopic_loss=5.34.ckpt
    GPT2-MYOPIC-NO-ORIG_val_myopic_loss=5.34.ckpt
    by wiwu23900
    GPT2-MYOPIC-val_kl_loss-0.247.ckpt
    GPT2-MYOPIC-val_kl_loss-0.247.ckpt
    by wiwu23900
    ARITH_GPT2_MYOPIC_MAX5_PAD10_REVERSE_RANDINIT_global_step=8783.0_train_loss=0.07.ckpt
    ARITH_GPT2_MYOPIC_MAX5_PAD10_REVERSE_RANDINIT_global_step=8783.0_train_loss=0.07.ckpt
    by wiwu23900
    ARITH_GPT2_MYOPIC_MAX5_PAD5_REVERSE_RANDINIT_global_step=8783.0_train_loss=0.04.ckpt
    ARITH_GPT2_MYOPIC_MAX5_PAD5_REVERSE_RANDINIT_global_step=8783.0_train_loss=0.04.ckpt
    by wiwu23900
    ARITH_GPT2_MYOPIC_MAX8_REVERSE_RANDINIT_global_step=19531.0_train_loss=0.79.ckpt
    ARITH_GPT2_MYOPIC_MAX8_REVERSE_RANDINIT_global_step=19531.0_train_loss=0.79.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1.4b_lr_8.00e-05_warmup_5.00e-02_global_step=1627.0_train_loss=2.71.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1.4b_lr_8.00e-05_warmup_5.00e-02_global_step=1627.0_train_loss=2.71.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-14m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=4.27.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-14m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=4.27.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-160m_lr_2.40e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.13.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-160m_lr_2.40e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.13.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1b_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=2.78.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1b_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=2.78.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-2.8b_lr_6.40e-05_warmup_5.00e-02_global_step=1627.0_train_loss=2.67.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-2.8b_lr_6.40e-05_warmup_5.00e-02_global_step=1627.0_train_loss=2.67.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-31m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.84.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-31m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.84.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-410m_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=2.89.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-410m_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=2.89.ckpt
    by wiwu23900
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-70m_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.48.ckpt
    PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-70m_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.48.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.01_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.01.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.01_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.01.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.10.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.10.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.3_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.32.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.0_conv-10_shift-1_p-0.3_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.32.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.1_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.23.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.1_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.23.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.3_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.18.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.3_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.18.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.4_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.22.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.4_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.22.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0.5_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=0.00.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0.5_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=0.00.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_BETA0_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    SYNTH-GPT2-MYOPIC-COS_BETA0_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-0_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.10.ckpt
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-0_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=0.10.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.27.ckpt
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=58593.0_train_loss=1.27.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-256_n_head-4_n_layer-4_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    SYNTH-GPT2-MYOPIC-COS_conv-10_shift-1_p-1_lr-0.001_n_embd-256_n_head-4_n_layer-4_activation_function-relu_global_step=58593.0_train_loss=1.24.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.0001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=7815.0_val_loss=1.18.ckpt
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.0001_n_embd-128_n_head-2_n_layer-3_activation_function-relu_global_step=7815.0_val_loss=1.18.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.0005_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=13677.0_val_loss=0.47.ckpt
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.0005_n_embd-128_n_head-2_n_layer-2_activation_function-relu_global_step=13677.0_val_loss=0.47.ckpt
    by wiwu23900
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.001_n_embd-128_n_head-2_n_layer-4_activation_function-relu_global_step=3907.0_val_loss=1.19.ckpt
    SYNTH-GPT2-MYOPIC-L_conv-10_shift-0_lr-0.001_n_embd-128_n_head-2_n_layer-4_activation_function-relu_global_step=3907.0_val_loss=1.19.ckpt
    by wiwu23900