{"id":782,"date":"2026-05-31T15:26:25","date_gmt":"2026-05-31T15:26:25","guid":{"rendered":"https:\/\/convly.ai\/llm-vram-calculator\/"},"modified":"2026-07-12T10:38:21","modified_gmt":"2026-07-12T10:38:21","slug":"llm-vram-calculator","status":"publish","type":"page","link":"https:\/\/convly.ai\/ar\/llm-vram-calculator\/","title":{"rendered":"\u0647\u0644 \u064a\u0645\u0643\u0646\u0646\u064a \u062a\u0634\u063a\u064a\u0644 \u0647\u0630\u0627 \u0627\u0644\u0646\u0645\u0648\u0630\u062c \u0627\u0644\u0644\u063a\u0648\u064a \u0627\u0644\u0643\u0628\u064a\u0631 \u0645\u062d\u0644\u064a\u064b\u0651\u0627\u061f \u2014 \u0622\u0644\u0629 \u062d\u0627\u0633\u0628\u0629 \u0645\u062c\u0627\u0646\u064a\u0629 \u0644\u062a\u0642\u062f\u064a\u0631 \u0627\u0644\u0640 VRAM \u0648\u0648\u062d\u062f\u0627\u062a \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a"},"content":{"rendered":"<p>Wondering whether your GPU can run a given large language model locally? This free calculator estimates the VRAM each model needs at every quantization level \u2014 and tells you exactly which ones fit your card.<\/p>\n<div class=\"llmc\" id=\"llmc\">\n  <div class=\"llmc-controls\">\n    <label>Model\n      <select id=\"llmc-model\">\n        <option value=\"\">\u2014 Custom (enter parameters) \u2014<\/option>\n        <option value=\"671\">DeepSeek R1 (671B)<\/option><option value=\"70\">DeepSeek R1 Distill Llama 70B (70B)<\/option><option value=\"284\">DeepSeek V4-Flash (284B)<\/option><option value=\"1600\">DeepSeek V4-Pro (1.6T)<\/option><option value=\"12\">Gemma 3 12B (12B)<\/option><option value=\"27\">Gemma 3 27B (27B)<\/option><option value=\"4\">Gemma 3 4B (4B)<\/option><option value=\"744\">GLM 5.2 (744B)<\/option><option value=\"1000\">Kimi K2.7 Code (1T)<\/option><option value=\"2800\">Kimi K3 (2.8T)<\/option><option value=\"8\">Llama 3.1 8B (8B)<\/option><option value=\"70\">Llama 3.3 70B (70B)<\/option><option value=\"400\">Llama 4 Maverick 
(400B)<\/option><option value=\"109\">Llama 4 Scout (109B)<\/option><option value=\"7\">Mistral 7B (7B)<\/option><option value=\"675\">Mistral Large 3 (675B)<\/option><option value=\"12\">Mistral NeMo 12B (12B)<\/option><option value=\"30\">NVIDIA Nemotron 3 Nano Omni (30B)<\/option><option value=\"14\">Phi-4 (14B)<\/option><option value=\"14\">Qwen3 14B (14B)<\/option><option value=\"235\">Qwen3 235B-A22B (235B)<\/option><option value=\"30\">Qwen3 30B-A3B (30B)<\/option><option value=\"32\">Qwen3 32B (32B)<\/option><option value=\"8\">Qwen3 8B (8B)<\/option>      <\/select>\n    <\/label>\n    <label id=\"llmc-custom-wrap\">Parameters (billions)\n      <input type=\"number\" id=\"llmc-params\" value=\"8\" min=\"0.1\" step=\"0.1\" \/>\n    <\/label>\n    <label>Your GPU\n      <select id=\"llmc-gpu\">\n        <option value=\"8\">8 GB (RTX 4060 \/ 3060 Ti)<\/option>\n        <option value=\"12\">12 GB (RTX 4070 \/ 3060)<\/option>\n        <option value=\"16\">16 GB (RTX 4060 Ti 16GB \/ 4080)<\/option>\n        <option value=\"24\" selected>24 GB (RTX 4090 \/ 3090)<\/option>\n        <option value=\"32\">32 GB (RTX 5090)<\/option>\n        <option value=\"48\">48 GB (RTX 6000 Ada \/ A6000)<\/option>\n        <option value=\"80\">80 GB (A100 \/ H100)<\/option>\n        <option value=\"160\">160 GB (2\u00d7 H100)<\/option>\n        <option value=\"custom\">Custom\u2026<\/option>\n      <\/select>\n    <\/label>\n    <label id=\"llmc-gpu-custom-wrap\" style=\"display:none\">VRAM (GB)\n      <input type=\"number\" id=\"llmc-vram\" value=\"24\" min=\"1\" step=\"1\" \/>\n    <\/label>\n    <label>Context\n      <select id=\"llmc-ctx\">\n        <option value=\"4\">4K<\/option>\n        <option value=\"8\" selected>8K<\/option>\n        <option value=\"32\">32K<\/option>\n        <option value=\"128\">128K<\/option>\n      <\/select>\n    <\/label>\n  <\/div>\n\n  <div id=\"llmc-verdict\" class=\"llmc-verdict\"><\/div>\n\n  <table class=\"llmc-table\">\n    
<thead><tr><th>Quantization<\/th><th>Quality<\/th><th>Est. VRAM<\/th><th>Fits your GPU?<\/th><\/tr><\/thead>\n    <tbody id=\"llmc-body\"><\/tbody>\n  <\/table>\n  <p class=\"llmc-note\">Estimate based on model weights at each quantization + ~1.5&nbsp;GB runtime overhead. Long contexts add KV-cache memory on top (grows with the context length you pick). For mixture-of-experts models, all parameters must be loaded into VRAM even though only a subset of them is used to compute each token.<\/p>\n<\/div>\n\n<script>(function(){\n  var QUANTS=[['FP16','full \/ lossless',2.0],['Q8_0','near-lossless',1.06],['Q6_K','excellent',0.82],['Q5_K_M','very good',0.69],['Q4_K_M','good (recommended)',0.58],['Q3_K_M','acceptable',0.43]];\n  var $=function(id){return document.getElementById(id);};\n  function params(){var m=$('llmc-model').value; return m?parseFloat(m):(parseFloat($('llmc-params').value)||0);}\n  function vram(){var g=$('llmc-gpu').value; return g==='custom'?(parseFloat($('llmc-vram').value)||0):parseFloat(g);}\n  function ctxKv(){var c=parseFloat($('llmc-ctx').value)||8; var p=params(); return (c\/8)*Math.max(0.5,p*0.03);} \/\/ rough KV add, scales with model+context\n  function render(){\n    var p=params(), gpu=vram(), over=1.5+ctxKv(), best=null, body=$('llmc-body'); body.innerHTML='';\n    QUANTS.forEach(function(qz){\n      var need=p*qz[2]+over, fits=need<=gpu;\n      if(fits&&!best)best=qz[0];\n      var tr=document.createElement('tr'); tr.className=fits?'llmc-ok':'llmc-no';\n      tr.innerHTML='<td><b>'+qz[0]+'<\/b><\/td><td>'+qz[1]+'<\/td><td>~'+need.toFixed(1)+' GB<\/td><td>'+(fits?'\u2705 Yes':'\u274c No')+'<\/td>';\n      body.appendChild(tr);\n    });\n    var v=$('llmc-verdict');\n    if(p<=0){v.className='llmc-verdict';v.textContent='Enter a model size to begin.';return;}\n    if(best){v.className='llmc-verdict llmc-vok';v.innerHTML='\u2705 Yes \u2014 you can run a <b>'+p+'B<\/b> model on <b>'+gpu+' GB<\/b> at <b>'+best+'<\/b> quantization.';}\n    else{var 
min=(p*0.43+over).toFixed(0);v.className='llmc-verdict llmc-vno';v.innerHTML='\u274c Not on '+gpu+' GB. A '+p+'B model needs ~<b>'+min+' GB<\/b> even at aggressive 3-bit. Try a smaller model, more VRAM, or CPU\/RAM offload.';}\n  }\n  $('llmc-model').addEventListener('change',function(){$('llmc-custom-wrap').style.display=this.value?'none':'';render();});\n  $('llmc-gpu').addEventListener('change',function(){$('llmc-gpu-custom-wrap').style.display=this.value==='custom'?'':'none';render();});\n  ['llmc-params','llmc-vram','llmc-ctx'].forEach(function(id){$(id).addEventListener('input',render);});\n  render();\n})();<\/script>\n\n<p>Pick a model from our <a href=\"\/models\/\">AI models database<\/a> (or enter a custom parameter count), choose your GPU, set your context length, and see instantly whether it runs \u2014 and at what quality. Estimates are weights-based; long contexts add KV-cache memory on top.<\/p>\n<p><!--geo-block--><\/p>\n<h2>Quick answer: how much VRAM do you need to run an LLM locally?<\/h2>\n<p>To run a large language model locally at 4-bit quantisation, budget roughly 0.5\u20130.6 GB of GPU VRAM per billion parameters, plus 1\u20133 GB for the KV cache at typical context lengths. In practice an 8B model fits in about 5\u20136 GB (comfortable on a 12GB RTX 3060; also runs on an 8GB RTX 4060), a 13B model needs ~8\u201310 GB, a 32B model wants a 24GB card like the RTX 4090 (~20 GB of weights), and a 70B model needs about 40\u201348 GB \u2014 a single 48GB card or two 24GB GPUs. 
At 8-bit those figures roughly double (~1\u20131.2 GB per billion) and at full fp16 they roughly quadruple (~2 GB per billion).<\/p>\n<p>Quick rules of thumb for the model weights, before adding context:<\/p>\n<ul>\n<li><strong>4-bit (Q4):<\/strong> ~0.5\u20130.6 GB per billion parameters<\/li>\n<li><strong>8-bit (Q8):<\/strong> ~1\u20131.2 GB per billion parameters<\/li>\n<li><strong>fp16 \/ bf16:<\/strong> ~2 GB per billion parameters<\/li>\n<li><strong>KV cache (context):<\/strong> add ~1\u20134 GB at 8K\u201332K tokens, more for very long context or larger models<\/li>\n<\/ul>\n<h2>Frequently asked questions<\/h2>\n<h3>How much VRAM do I need to run a 70B model?<\/h3>\n<p>A 70B model needs roughly 40\u201348 GB of VRAM at 4-bit quantisation \u2014 about 0.5\u20130.6 GB per billion parameters for the weights, plus the KV cache. That fits on a single 48GB card or two 24GB GPUs such as a pair of RTX 4090s. At 8-bit it roughly doubles to ~80 GB, and at fp16 you need around 140 GB.<\/p>\n<h3>Can my RTX 4060 or a 12GB GPU run an 8B or 7B model?<\/h3>\n<p>Yes. A 7B\u20138B model at 4-bit uses only about 4\u20135 GB of VRAM for the weights (roughly 5\u20136 GB once you add the KV cache), so it runs comfortably on a 12GB card such as the RTX 3060, with plenty of headroom left for context. The RTX 4060 is an 8GB card, and it still handles a 7\u20138B model fine \u2014 you just have less room for long context windows.<\/p>\n<h3>How much VRAM does a 13B model need?<\/h3>\n<p>A 13B model needs about 8\u201310 GB of VRAM at 4-bit quantisation. It fits on a 12GB GPU for short-to-moderate context, but a 16GB card is safer once you add a longer context window and framework overhead.<\/p>\n<h3>Does a 24GB GPU like the RTX 4090 run a 32B or 70B model?<\/h3>\n<p>A 24GB GPU runs a 32B model well at 4-bit: the weights take about 20 GB, leaving a few GB for the KV cache. 
It cannot fit a 70B model at 4-bit \u2014 that needs ~40\u201348 GB \u2014 so you would need a second GPU, CPU offloading, or a much lower-quality ~2-bit quant (even a 3-bit 70B quant needs about 30 GB).<\/p>\n<h3>How much GPU memory does an LLM use per billion parameters?<\/h3>\n<p>As a rule of thumb, budget ~0.5\u20130.6 GB per billion parameters at 4-bit, ~1\u20131.2 GB at 8-bit, and ~2 GB at fp16\/bf16. So a 30B model is roughly 16\u201318 GB at 4-bit, ~32 GB at 8-bit, and ~60 GB at fp16, before you add the KV cache.<\/p>\n<h3>How much extra VRAM does context length (the KV cache) use?<\/h3>\n<p>The KV cache grows linearly with context length. On an 8B model it takes roughly 1 GB at 8K tokens and about 4 GB at 32K \u2014 about 3 GB extra \u2014 and larger models and longer contexts use more. Quantising the KV cache to 8-bit roughly halves that cost; either way, leave 1\u20134 GB of headroom on top of the model weights.<\/p>\n<h3>What model size can I run on 8GB or 16GB of VRAM?<\/h3>\n<p>On 8 GB of VRAM you can comfortably run 7\u20138B models at 4-bit; on 16 GB you can run up to ~13B comfortably and squeeze in a 20\u201324B model with a modest context. 
For a 32B model you want 24 GB, and for a 70B model around 48 GB.<\/p>\n<p><!--convly-tools--><br \/>\n<style>.ctools-wrap{margin:36px 0 10px;border-top:1px solid #e6e8ef;padding-top:22px}.ctools-h{font-weight:700;font-size:14px;color:#1a1a2e;margin:0 0 14px;text-transform:uppercase;letter-spacing:.05em}.ctools-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(210px,1fr));gap:12px}.ctool{display:flex;flex-direction:column;gap:5px;padding:15px 16px;border:1px solid #e6e8ef;border-radius:12px;background:#f8f9fb;text-decoration:none!important;transition:transform .15s,box-shadow .15s,border-color .15s}.ctool:hover{border-color:#2f6fed;background:#fff;box-shadow:0 8px 20px -10px rgba(47,111,237,.4);transform:translateY(-2px)}.ctool-e{font-size:23px;line-height:1}.ctool-t{font-weight:700;font-size:14.5px;color:#1a3ba3}.ctool-d{font-size:12.5px;color:#5a6472;line-height:1.4}<\/style><div class=\"ctools-wrap\"><p class=\"ctools-h\">More free tools from Convly<\/p><div class=\"ctools-grid\"><a class=\"ctool\" href=\"\/models\/\"><span class=\"ctool-e\">&#128202;<\/span><span class=\"ctool-t\">AI Models Database<\/span><span class=\"ctool-d\">30+ LLMs \u2014 specs, pricing &amp; context, side by side.<\/span><\/a><a class=\"ctool\" href=\"\/llm-leaderboard\/\"><span class=\"ctool-e\">&#127942;<\/span><span class=\"ctool-t\">LLM Leaderboard 2026<\/span><span class=\"ctool-d\">Rank every model by intelligence, price &amp; speed.<\/span><\/a><a class=\"ctool\" href=\"\/ai-api-cost-calculator\/\"><span class=\"ctool-e\">&#128181;<\/span><span class=\"ctool-t\">AI API Cost Calculator<\/span><span class=\"ctool-d\">Compare what each model costs you per month.<\/span><\/a><a class=\"ctool\" href=\"\/llm-vram-calculator\/\"><span class=\"ctool-e\">&#127918;<\/span><span class=\"ctool-t\">LLM VRAM Calculator<\/span><span class=\"ctool-d\">Can your GPU run that model locally? 
Find out.<\/span><\/a><a class=\"ctool\" href=\"\/self-hosting-vs-api-calculator\/\"><span class=\"ctool-e\">&#9878;<\/span><span class=\"ctool-t\">Self-Hosting vs API<\/span><span class=\"ctool-d\">Buy a GPU or pay per token? See the break-even.<\/span><\/a><a class=\"ctool\" href=\"\/compare-ai-models-and-gpus-2026\/\"><span class=\"ctool-e\">&#129504;<\/span><span class=\"ctool-t\">Compare Models &amp; GPUs<\/span><span class=\"ctool-d\">Every model paired with the GPU that runs it.<\/span><\/a><a class=\"ctool\" href=\"\/image-to-prompt\/\"><span class=\"ctool-e\">&#128444;<\/span><span class=\"ctool-t\">Image-to-Prompt<\/span><span class=\"ctool-d\">Turn any image into an editable AI prompt.<\/span><\/a><\/div><\/div><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Wondering whether your GPU can run a given large language model locally? This free calculator estimates the VRAM each model [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","ast-disable-related-posts":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"default","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"class_list":["post-782","page","type-page","status-publish","hentry"],"_links":{"self":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages\/782","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/comments?post=782"}],"version-history":[{"count":3,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages\/782\/revisions"}],"predecessor-version":[{"id":1544,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages\/782\/revisions\/1544"}],"wp:attachment":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/media?parent=782"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}