{"id":782,"date":"2026-05-31T15:26:25","date_gmt":"2026-05-31T15:26:25","guid":{"rendered":"https:\/\/convly.ai\/llm-vram-calculator\/"},"modified":"2026-05-31T15:26:25","modified_gmt":"2026-05-31T15:26:25","slug":"llm-vram-calculator","status":"publish","type":"page","link":"https:\/\/convly.ai\/ar\/llm-vram-calculator\/","title":{"rendered":"\u062d\u0627\u0633\u0628\u0629 LLM VRAM - \u0645\u0642\u062f\u0627\u0631 \u0630\u0627\u0643\u0631\u0629 \u0648\u062d\u062f\u0629 \u0645\u0639\u0627\u0644\u062c\u0629 \u0627\u0644\u0631\u0633\u0648\u0645\u0627\u062a \u0644\u062a\u0634\u063a\u064a\u0644 \u0623\u064a \u0646\u0645\u0648\u0630\u062c \u0630\u0643\u0627\u0621 \u0627\u0635\u0637\u0646\u0627\u0639\u064a"},"content":{"rendered":"<p class=\"convly-lede\">Wondering how much GPU memory you need to run a large language model locally? This free calculator estimates the VRAM required for any LLM \u2014 based on its size, your quantization, and your context length \u2014 then shows you exactly which GPUs and Macs can run it.<\/p>\n    <div class=\"convly-vram\" id=\"convly-vram\">\n      <div class=\"cv-grid\">\n        <div class=\"cv-controls\">\n          <label class=\"cv-field\">\n            <span>Model<\/span>\n            <select id=\"cv-model\"><\/select>\n          <\/label>\n          <label class=\"cv-field cv-custom\" id=\"cv-custom-wrap\" style=\"display:none;\">\n            <span>Parameters (billions)<\/span>\n            <input type=\"number\" id=\"cv-params\" value=\"13\" min=\"0.1\" max=\"2000\" step=\"0.1\">\n          <\/label>\n          <label class=\"cv-field\">\n            <span>Quantization<\/span>\n            <select id=\"cv-quant\">\n              <option value=\"2.0\">FP16 \/ BF16 (full precision)<\/option>\n              <option value=\"1.0\">FP8 \/ INT8 (Q8)<\/option>\n              <option value=\"0.82\">Q6_K<\/option>\n              <option value=\"0.69\">Q5_K_M<\/option>\n              <option value=\"0.58\" selected>Q4_K_M (most 
popular)<\/option>\n              <option value=\"0.46\">Q3_K_M<\/option>\n              <option value=\"0.35\">Q2_K (extreme)<\/option>\n            <\/select>\n          <\/label>\n          <label class=\"cv-field\">\n            <span>Context length: <strong id=\"cv-ctx-label\">8K<\/strong> tokens<\/span>\n            <input type=\"range\" id=\"cv-ctx\" min=\"2048\" max=\"131072\" step=\"2048\" value=\"8192\">\n          <\/label>\n          <label class=\"cv-field\">\n            <span>Concurrent requests (batch size)<\/span>\n            <input type=\"number\" id=\"cv-batch\" value=\"1\" min=\"1\" max=\"64\" step=\"1\">\n          <\/label>\n        <\/div>\n\n        <div class=\"cv-result\">\n          <div class=\"cv-total\">\n            <span class=\"cv-total-label\">Estimated VRAM needed<\/span>\n            <span class=\"cv-total-num\" id=\"cv-total\">&mdash;<\/span>\n          <\/div>\n          <div class=\"cv-break\">\n            <div><span>Model weights<\/span><strong id=\"cv-w\">&mdash;<\/strong><\/div>\n            <div><span>KV cache (context)<\/span><strong id=\"cv-kv\">&mdash;<\/strong><\/div>\n            <div><span>Overhead<\/span><strong id=\"cv-oh\">&mdash;<\/strong><\/div>\n          <\/div>\n        <\/div>\n      <\/div>\n\n      <h3 class=\"cv-gpu-head\">Which hardware can run it?<\/h3>\n      <div class=\"cv-gpus\" id=\"cv-gpus\"><\/div>\n      <p class=\"cv-note\">Estimate only. Real usage varies with framework (llama.cpp, vLLM, ExLlama), KV-cache precision, and flash-attention. KV cache is computed at FP16; MoE models are sized by total parameters for weights. 
Use this as a planning guide, not a guarantee.<\/p>\n    <\/div>\n\n    <style>\n    .convly-vram{font-family:Inter,system-ui,sans-serif;max-width:880px;margin:1.5rem auto;color:#141414;}\n    .convly-vram *{box-sizing:border-box;}\n    .convly-vram .cv-grid{display:grid;grid-template-columns:1fr 1fr;gap:1.25rem;align-items:start;}\n    @media(max-width:680px){.convly-vram .cv-grid{grid-template-columns:1fr;}}\n    .convly-vram .cv-controls{display:flex;flex-direction:column;gap:1rem;}\n    .convly-vram .cv-field{display:block;font-size:.85rem;font-weight:600;}\n    .convly-vram .cv-field>span{display:block;margin-bottom:.4rem;}\n    .convly-vram .cv-field select,.convly-vram .cv-field input[type=number]{width:100%;padding:.6rem .7rem;border:1px solid #d4d4d4;border-radius:8px;font:inherit;font-weight:500;background:#fff;color:#141414;}\n    .convly-vram .cv-field input[type=range]{width:100%;accent-color:#bb1919;}\n    .convly-vram .cv-field select:focus,.convly-vram .cv-field input:focus{outline:none;border-color:#bb1919;box-shadow:0 0 0 3px rgba(187,25,25,.12);}\n    .convly-vram .cv-result{background:#141414;color:#fff;border-radius:14px;padding:1.5rem;position:sticky;top:1rem;}\n    .convly-vram .cv-total{text-align:center;border-bottom:1px solid #333;padding-bottom:1rem;margin-bottom:1rem;}\n    .convly-vram .cv-total-label{display:block;font-size:.75rem;color:#a3a3a3;text-transform:uppercase;letter-spacing:.06em;}\n    .convly-vram .cv-total-num{display:block;font-size:2.8rem;font-weight:800;line-height:1.1;margin-top:.3rem;}\n    .convly-vram .cv-total-num span{font-size:1.3rem;color:#ef4444;}\n    .convly-vram .cv-break div{display:flex;justify-content:space-between;font-size:.9rem;padding:.35rem 0;color:#d4d4d4;}\n    .convly-vram .cv-break strong{color:#fff;}\n    .convly-vram .cv-gpu-head{font-family:'Source Serif 4',Georgia,serif;font-size:1.25rem;margin:1.75rem 0 .9rem;}\n    .convly-vram 
.cv-gpus{display:grid;grid-template-columns:repeat(auto-fill,minmax(220px,1fr));gap:.6rem;}\n    .convly-vram .cv-gpu{display:flex;align-items:center;justify-content:space-between;padding:.65rem .8rem;border:1px solid #e5e5e5;border-radius:9px;font-size:.88rem;}\n    .convly-vram .cv-gpu.fit{background:#f0fdf4;border-color:#bbf7d0;}\n    .convly-vram .cv-gpu.tight{background:#fffbeb;border-color:#fde68a;}\n    .convly-vram .cv-gpu.no{background:#fef2f2;border-color:#fecaca;opacity:.7;}\n    .convly-vram .cv-gpu b{font-weight:700;}\n    .convly-vram .cv-gpu .cv-vram{color:#737373;font-weight:600;margin-left:.4rem;}\n    .convly-vram .cv-gpu .cv-badge{font-weight:800;white-space:nowrap;}\n    .convly-vram .cv-gpu.fit .cv-badge{color:#15803d;}\n    .convly-vram .cv-gpu.tight .cv-badge{color:#b45309;}\n    .convly-vram .cv-gpu.no .cv-badge{color:#b91c1c;}\n    .convly-vram .cv-note{font-size:.78rem;color:#737373;margin-top:1rem;line-height:1.5;}\n    <\/style>\n\n    <script>\n    (function(){\n      var MODELS=[\n        [\"Llama 3.1 8B\",8.0,32,8,128],\n        [\"Llama 3.1 70B\",70.6,80,8,128],\n        [\"Llama 3.1 405B\",405,126,8,128],\n        [\"Mistral 7B v0.3\",7.2,32,8,128],\n        [\"Mixtral 8x7B (MoE)\",46.7,32,8,128],\n        [\"Mixtral 8x22B (MoE)\",141,56,8,128],\n        [\"Qwen 2.5 7B\",7.6,28,4,128],\n        [\"Qwen 2.5 32B\",32.5,64,8,128],\n        [\"Qwen 2.5 72B\",72.7,80,8,128],\n        [\"Gemma 2 9B\",9.2,42,8,256],\n        [\"Gemma 2 27B\",27.2,46,16,128],\n        [\"DeepSeek V3\/V4 671B (MoE)\",671,61,128,128],\n        [\"Command R 35B\",35,40,8,128],\n        [\"Yi 1.5 34B\",34,60,8,128],\n        [\"Phi-3 medium 14B\",14,40,10,128],\n        [\"-- Custom --\",13,40,16,128]\n      ];\n      var GPUS=[\n        [\"RTX 4060 Ti\",16],[\"RTX 4070 Ti Super\",16],[\"RTX 4080 Super\",16],[\"RTX 5080\",16],\n        [\"RTX 3090\",24],[\"RTX 4090\",24],[\"RTX 5090\",32],\n        [\"2x RTX 4090\",48],[\"RTX A6000\",48],[\"A100 
40GB\",40],[\"A100 80GB\",80],\n        [\"H100 80GB\",80],[\"H200\",141],[\"B200\",192],\n        [\"Mac M4 Pro 48GB\",48],[\"Mac M4 Max 128GB\",128],[\"Mac M4 Ultra 256GB\",256],[\"Mac M3 Ultra 512GB\",512],\n        [\"4x RTX 4090\",96],[\"8x H100\",640]\n      ];\n      var $=function(id){return document.getElementById(id);};\n      if(!$(\"cv-model\"))return;\n      var mSel=$(\"cv-model\");\n      MODELS.forEach(function(m,i){var o=document.createElement(\"option\");o.value=i;o.textContent=m[0];if(i===1)o.selected=true;mSel.appendChild(o);});\n      function fmtCtx(v){return v>=1024?(v\/1024)+\"K\":v;}\n      function gb(bytes){return bytes\/Math.pow(1024,3);}\n      function calc(){\n        var m=MODELS[+mSel.value];\n        var isCustom=m[0]===\"-- Custom --\";\n        $(\"cv-custom-wrap\").style.display=isCustom?\"block\":\"none\";\n        var params=isCustom?(parseFloat($(\"cv-params\").value)||0):m[1];\n        var layers=m[2],kvHeads=m[3],headDim=m[4];\n        if(isCustom){layers=Math.max(24,Math.round(8*Math.cbrt(params)));kvHeads=8;headDim=128;}\n        var bpp=parseFloat($(\"cv-quant\").value);\n        var ctx=parseInt($(\"cv-ctx\").value,10);\n        var batch=Math.max(1,parseInt($(\"cv-batch\").value,10)||1);\n        $(\"cv-ctx-label\").textContent=fmtCtx(ctx);\n        var weights=params*1e9*bpp;\n        var kv=2*layers*kvHeads*headDim*ctx*batch*2;\n        var base=weights+kv;\n        var overhead=Math.max(1.5*Math.pow(1024,3),0.08*base);\n        var total=base+overhead;\n        var tGB=gb(total);\n        $(\"cv-total\").innerHTML=tGB.toFixed(1)+\" <span>GB<\/span>\";\n        $(\"cv-w\").textContent=gb(weights).toFixed(1)+\" GB\";\n        $(\"cv-kv\").textContent=gb(kv).toFixed(1)+\" GB\";\n        $(\"cv-oh\").textContent=gb(overhead).toFixed(1)+\" GB\";\n        var box=$(\"cv-gpus\");box.innerHTML=\"\";\n        GPUS.forEach(function(g){\n          var usable=g[1]*0.92;var need=tGB;var cls,badge;\n          
if(need<=usable){cls=\"fit\";badge=\"\u2713 Runs\";}\n          else if(need<=g[1]){cls=\"tight\";badge=\"\u26a0 Tight\";}\n          else{cls=\"no\";badge=\"\u2717 Too small\";}\n          var d=document.createElement(\"div\");d.className=\"cv-gpu \"+cls;\n          d.innerHTML='<span><b>'+g[0]+'<\/b><span class=\"cv-vram\">'+g[1]+'GB<\/span><\/span><span class=\"cv-badge\">'+badge+'<\/span>';\n          box.appendChild(d);\n        });\n      }\n      [\"cv-model\",\"cv-quant\",\"cv-ctx\",\"cv-batch\",\"cv-params\"].forEach(function(id){\n        var el=$(id);if(el){el.addEventListener(\"input\",calc);el.addEventListener(\"change\",calc);}\n      });\n      calc();\n    })();\n    <\/script>\n    \n<h2>How to use the VRAM calculator<\/h2>\n<ol>\n<li><strong>Pick your model<\/strong> \u2014 choose a popular model (Llama 3, Qwen, Mixtral, DeepSeek, Gemma\u2026) or select &#8220;Custom&#8221; and enter any parameter count.<\/li>\n<li><strong>Choose a quantization<\/strong> \u2014 full precision (FP16) is the most accurate but largest; Q4_K_M is the most popular balance of size and quality. Lower quants shrink VRAM at some quality cost.<\/li>\n<li><strong>Set your context length<\/strong> \u2014 longer context means a bigger KV cache and more VRAM. Most chat use cases are fine at 8K\u201332K.<\/li>\n<li><strong>Read the result<\/strong> \u2014 the total VRAM estimate breaks down into model weights, KV cache, and overhead, and every GPU is marked \u2713 Runs, \u26a0 Tight, or \u2717 Too small.<\/li>\n<\/ol>\n<h2>How LLM VRAM is calculated<\/h2>\n<p>The memory an LLM needs to run (inference) comes from three parts:<\/p>\n<ul>\n<li><strong>Model weights<\/strong> = parameters \u00d7 bytes-per-parameter. At FP16 that&#8217;s 2 bytes per parameter, so a 70B model needs ~140&nbsp;GB. Quantizing to 4-bit (Q4) cuts that to roughly 40&nbsp;GB.<\/li>\n<li><strong>KV cache<\/strong> = the attention key\/value memory that grows with context length and batch size. 
For long contexts it can rival the weights themselves.<\/li>\n<li><strong>Overhead<\/strong> = activations, CUDA\/Metal buffers, and framework reserves \u2014 typically 5\u201315% on top.<\/li>\n<\/ul>\n<p>The quick rule of thumb: <strong>VRAM \u2248 (parameters in billions \u00d7 bytes-per-parameter) + KV cache + ~10% overhead.<\/strong> The calculator above does the full math for you, including per-model layer and attention-head counts.<\/p>\n<h2>Quantization quick reference<\/h2>\n<table class=\"convly-vs\">\n<thead>\n<tr>\n<th>Quantization<\/th>\n<th>Bytes \/ param<\/th>\n<th>70B model weights<\/th>\n<th>Quality<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>FP16 \/ BF16<\/td>\n<td>2.0<\/td>\n<td>~140 GB<\/td>\n<td>Reference (best)<\/td>\n<\/tr>\n<tr>\n<td>Q8 \/ FP8<\/td>\n<td>1.0<\/td>\n<td>~70 GB<\/td>\n<td>Near-lossless<\/td>\n<\/tr>\n<tr>\n<td class=\"convly-vs-winner\">Q4_K_M<\/td>\n<td>~0.58<\/td>\n<td>~41 GB<\/td>\n<td>Best balance (recommended)<\/td>\n<\/tr>\n<tr>\n<td>Q3_K_M<\/td>\n<td>~0.46<\/td>\n<td>~33 GB<\/td>\n<td>Noticeable loss<\/td>\n<\/tr>\n<tr>\n<td>Q2_K<\/td>\n<td>~0.35<\/td>\n<td>~25 GB<\/td>\n<td>Last resort<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h2>Frequently asked questions<\/h2>\n<h3>How much VRAM do I need to run Llama 3 70B?<\/h3>\n<p>At Q4_K_M with an 8K context, roughly <strong>43\u201348 GB<\/strong> \u2014 so you need a single 48 GB card (RTX A6000), two 24 GB GPUs (2\u00d7 RTX 4090\/3090), or a 64 GB+ Mac. At full FP16 you&#8217;d need ~150 GB (an A100 80GB pair; a single 141 GB H200 falls just short). Use the calculator above for your exact settings.<\/p>\n<h3>Can I run a 70B model on a 24 GB GPU like the RTX 4090?<\/h3>\n<p>Not at useful quality on a single 24 GB card \u2014 even Q3 puts a 70B model around 35 GB. You can run it across two 24 GB GPUs, or step down to a 32B-class model (Qwen 2.5 32B) which fits comfortably at Q4 on a single 4090.<\/p>\n<h3>Does quantization hurt quality?<\/h3>\n<p>A little. 
Q8 is effectively lossless; Q4_K_M loses very little for most tasks and is the community default; at Q3 and below the degradation becomes noticeable. For coding and reasoning, stay at Q4 or higher when you can.<\/p>\n<h3>Why does context length increase VRAM so much?<\/h3>\n<p>The KV cache stores attention state for every token in the context, for every layer. Doubling the context roughly doubles the KV cache. At very long contexts (128K), the cache alone can exceed the model weights \u2014 which is why long-context inference needs so much memory.<\/p>\n<h3>How accurate is this calculator?<\/h3>\n<p>It&#8217;s a planning estimate, typically within ~10\u201315% of real-world usage. Actual memory depends on your framework (llama.cpp, vLLM, ExLlama), whether you quantize the KV cache, whether flash-attention is enabled, and how much memory the OS reserves. Always leave headroom.<\/p>\n<p><em>Building or buying a rig for local AI? See our guides on the <a href=\"\/best-gpus-for-local-llms-2026\/\">best GPUs for local LLMs<\/a> and <a href=\"\/vram-requirements-every-major-llm-2026\/\">VRAM requirements for every major LLM<\/a>.<\/em><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Wondering how much GPU memory you need to run a large language model locally? 
This free calculator estimates the VRAM [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","ast-disable-related-posts":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"default","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"class_list":["post-782","page","type-page","status-publish","hentry"],"_links":{"self":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages\/782","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/comments?post=782"}],"version-history":[{"count":0,"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/pages\/782\/revisions"}],"wp:attachment":[{"href":"https:\/\/convly.ai\/ar\/wp-json\/wp\/v2\/media?parent=782"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}