# Section 7: explain the Q1_0_g128 1-bit weight format with an ASCII diagram.
part("7 · Q1_0_g128 Quantization — What's Happening Under the Hood")

print(textwrap.dedent("""
╔══════════════════════════════════════════════════════════════╗
║          Bonsai Q1_0_g128 Weight Representation              ║
╠══════════════════════════════════════════════════════════════╣
║  Every weight = 1 bit:  0 → −scale                           ║
║                         1 → +scale                           ║
║  Each 128 weights share one FP16 scale factor.               ║
║                                                              ║
║  Effective bits per weight:                                  ║
║    1 bit (sign) + 16/128 bits (shared scale) = 1.125 bpw     ║
║                                                              ║
║  Memory comparison for Bonsai-1.7B:                          ║
║    FP16:           3.44 GB   (1.0× baseline)                 ║
║    Q1_0_g128:      0.24 GB   (14.2× smaller!)                ║
║    MLX 1-bit g128: 0.27 GB   (12.8× smaller)                 ║
╚══════════════════════════════════════════════════════════════╝
"""))
print("📐 Python demo of Q1_0_g128 quantization logic:\n")

import random

random.seed(42)  # deterministic demo output
GROUP_SIZE = 128

# One group of synthetic FP16-like weights drawn from N(0, 0.1).
weights_fp16 = [random.gauss(0, 0.1) for _ in range(GROUP_SIZE)]

# Q1_0_g128: one shared absmax scale per group, one sign bit per weight.
scale = max(abs(w) for w in weights_fp16)
quantized = [1 if w >= 0 else 0 for w in weights_fp16]   # bit 1 → +scale, bit 0 → −scale
dequantized = [scale if b == 1 else -scale for b in quantized]

# Mean squared reconstruction error over the group.
mse = sum((a - b) ** 2 for a, b in zip(weights_fp16, dequantized)) / GROUP_SIZE

print(f"  FP16 weights (first 8): {[f'{w:.4f}' for w in weights_fp16[:8]]}")
print(f"  1-bit repr   (first 8): {quantized[:8]}")
print(f"  Shared scale: {scale:.4f}")
print(f"  Dequantized  (first 8): {[f'{w:.4f}' for w in dequantized[:8]]}")
print(f"  MSE of reconstruction: {mse:.6f}")

# Bytes per group: FP16 = 2 B/weight; Q1_0_g128 = 1 bit/weight + one FP16 scale (2 B).
memory_fp16 = GROUP_SIZE * 2
memory_1bit = GROUP_SIZE / 8 + 2
print(f"\n  Memory: FP16={memory_fp16}B vs Q1_0_g128={memory_1bit:.1f}B "
      f"({memory_fp16/memory_1bit:.1f}× reduction)")
# Section 8: measure decode throughput on the local GPU.
part("8 · Performance Benchmark — Tokens per Second")
def benchmark(prompt, n_tokens=128, n_runs=3, **kw):
    """Run `infer` n_runs times on `prompt` and return average throughput.

    Args:
        prompt: text prompt passed straight through to `infer`.
        n_tokens: tokens generated per run (`n_predict`).
        n_runs: number of timed repetitions to average over.
        **kw: extra keyword arguments forwarded to `infer`.

    Returns:
        Average tokens/second across all runs (float).
    """
    timings = []
    for i in range(n_runs):
        print(f"  Run {i+1}/{n_runs} …", end=" ", flush=True)
        # infer returns (output_text, elapsed_seconds); we only need the timing.
        _, elapsed = infer(prompt, verbose=False, n_predict=n_tokens, **kw)
        tps = n_tokens / elapsed
        timings.append(tps)
        print(f"{tps:.1f} tok/s")
    avg = sum(timings) / len(timings)
    print(f"\n  ✅ Average: {avg:.1f} tok/s (over {n_runs} runs, {n_tokens} tokens each)")
    return avg
# Run the benchmark, then show measured throughput next to published numbers.
print("📊 Benchmarking Bonsai-1.7B on your GPU …")
tps = benchmark(
    "Explain the concept of neural network backpropagation step by step.",
    n_tokens=128, n_runs=3,
)

print("\n  Published reference throughputs (from whitepaper):")
print("  ┌──────────────────────┬─────────┬──────────────┐")
print("  │ Platform             │ Backend │ TG128 tok/s  │")
print("  ├──────────────────────┼─────────┼──────────────┤")
print("  │ RTX 4090             │ CUDA    │     674      │")
print("  │ M4 Pro 48 GB         │ Metal   │     250      │")
print(f"  │ Your GPU (measured)  │ CUDA    │ {tps:>7.1f}  │")
print("  └──────────────────────┴─────────┴──────────────┘")
# Section 9: multi-turn chat where each turn is appended to a growing history.
part("9 · Multi-Turn Chat with Context Accumulation")
def chat(user_msg, system="You are a helpful assistant.", history=None, **kw):
    """Run one chat turn against the model via llama-cli, keeping history.

    Args:
        user_msg: the new user message for this turn.
        system: system prompt placed at the top of the ChatML transcript.
        history: list of (role, message) tuples from prior turns; a fresh
            list is created when None (avoids the mutable-default pitfall).
        **kw: accepted for forward compatibility (currently unused).

    Returns:
        (reply, history): the assistant's reply text and the updated history,
        which now includes both this user turn and the assistant's reply.
    """
    if history is None:
        history = []
    history.append(("user", user_msg))

    # Build the full ChatML prompt: system block, then every accumulated turn,
    # ending with an open assistant block for the model to complete.
    full = f"<|im_start|>system\n{system}<|im_end|>\n"
    for role, msg in history:
        full += f"<|im_start|>{role}\n{msg}<|im_end|>\n"
    full += "<|im_start|>assistant\n"

    # Escape for embedding inside a double-quoted shell argument; the -e flag
    # makes llama-cli re-expand the literal \n sequences into newlines.
    safe = full.replace('"', '\\"').replace('\n', '\\n')
    cmd = (
        f'{LLAMA_CLI} -m "{MODEL_PATH}"'
        f' -p "{safe}" -e'
        f' -n 200 --temp 0.5 --top-p 0.85 --top-k 20'
        f' -ngl 99 -c 4096 --no-display-prompt'
    )
    result = run(cmd, capture=True, check=False)
    reply = result.stdout.strip()
    history.append(("assistant", reply))
    return reply, history
# Drive a 3-turn conversation; `history` carries context between turns.
print("🗣 Starting a 3-turn conversation about 1-bit models …\n")
history = []
turns = [
    "What is a 1-bit language model?",
    "What are the main trade-offs compared to 4-bit or 8-bit quantization?",
    "How does Bonsai specifically address those trade-offs?",
]
for i, msg in enumerate(turns, 1):
    print(f"👤 Turn {i}: {msg}")
    reply, history = chat(msg, history=history)
    print(f"🤖 Bonsai: {reply}\n")
    time.sleep(0.5)  # brief pause between turns for readable output
# Section 10: run the same prompt under four sampling presets, from
# near-greedy (low temp / tight top-k) to high-entropy sampling.
part("10 · Sampling Parameter Exploration")

creative_prompt = "Write a one-sentence description of a futuristic city powered entirely by 1-bit AI."
configs = [
    ("Precise / Focused",  dict(temp=0.1, top_k=10,  top_p=0.70)),
    ("Balanced (default)", dict(temp=0.5, top_k=20,  top_p=0.85)),
    ("Creative / Varied",  dict(temp=0.9, top_k=50,  top_p=0.95)),
    ("High entropy",       dict(temp=1.2, top_k=100, top_p=0.98)),
]
print(f'Prompt: "{creative_prompt}"\n')
for label, params in configs:
    out, _ = infer(creative_prompt, verbose=False, n_predict=80, **params)
    print(f"  [{label}]")
    print(f"  temp={params['temp']}, top_k={params['top_k']}, top_p={params['top_p']}")
    print(f"  → {out[:200]}\n")  # truncate long generations for display

