<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Math & Concepts - Cheat Sheet</title>
<link rel="stylesheet" href="../css/style.css">
</head>
<body>

<nav>
<div class="nav-inner">
<a href="../index.html" class="nav-brand">AI Cheat Sheet</a>
<div class="nav-links">
<a href="/pages/terminology.html">Terminology</a>
<a href="/pages/techniques.html">Techniques</a>
<a href="/pages/use-cases.html">Use Cases</a>
<a href="/pages/model-types.html">Model Types</a>
<a href="/pages/prompts.html">Prompt Guide</a>
<a href="/pages/math.html" class="active">Math & Concepts</a>
<a href="/pages/chat.html">Chat</a>
<a href="/pages/image-gen.html">Image Gen</a>
</div>
<button class="dark-toggle" id="darkToggle" aria-label="Toggle dark mode">🌙</button>
</div>
</nav>

<button class="menu-toggle" id="menuToggle" aria-label="Toggle menu">☰</button>
<div class="sidebar-backdrop" id="sidebarBackdrop"></div>

<script>
(function(){
var btn = document.getElementById('darkToggle');
var saved = localStorage.getItem('theme');
if(saved === 'dark' || (!saved && window.matchMedia('(prefers-color-scheme: dark)').matches)){
document.documentElement.setAttribute('data-theme','dark');
btn.textContent = '☀️';
}
btn.addEventListener('click', function(){
var isDark = document.documentElement.getAttribute('data-theme') === 'dark';
if(isDark){
document.documentElement.removeAttribute('data-theme');
btn.textContent = '🌙';
localStorage.setItem('theme','light');
} else {
document.documentElement.setAttribute('data-theme','dark');
btn.textContent = '☀️';
localStorage.setItem('theme','dark');
}
});

var menuToggle = document.getElementById('menuToggle');
var nav = document.querySelector('nav');
var backdrop = document.getElementById('sidebarBackdrop');
if(menuToggle && nav){
menuToggle.addEventListener('click', function(){
nav.classList.toggle('sidebar-open');
var isOpen = nav.classList.contains('sidebar-open');
menuToggle.textContent = isOpen ? '✕' : '☰';
if(backdrop){
backdrop.classList.toggle('visible', isOpen);
}
});
if(backdrop){
backdrop.addEventListener('click', function(){
nav.classList.remove('sidebar-open');
menuToggle.textContent = '☰';
backdrop.classList.remove('visible');
});
}
document.addEventListener('click', function(e){
if(nav.classList.contains('sidebar-open') && !nav.contains(e.target) && e.target !== menuToggle){
nav.classList.remove('sidebar-open');
menuToggle.textContent = '☰';
if(backdrop) backdrop.classList.remove('visible');
}
});
}
})();
</script>

<div class="hero">
<h1>Math & Concepts</h1>
<p>The underlying ideas that make AI work — explained simply.</p>
</div>

<div class="container">

<h2 class="section-title">Core Concepts</h2>
<div class="def-card">
<span class="category">Architecture</span>
<h3>Attention Mechanism</h3>
<p>A way for the model to weigh the importance of different parts of the input when processing each token. "Attention Is All You Need" — the 2017 paper that launched the transformer revolution.</p>
<div class="example"><strong>Analogy:</strong> When reading a sentence, you naturally pay more attention to certain words. "The cat that chased the mouse hid" — you attend to "cat" when processing "hid".</div>
<button class="llm-btn" onclick="walkThrough('📐 Attention Mechanism', 'Walk through attention with a concrete numerical example. If we have tokens [A, B, C] with embeddings [1,0,0], [0,1,0], [0,0,1], show how Q, K, V are computed and how attention weights are calculated step by step.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Architecture</span>
<h3>Self-Attention</h3>
<p>Each token in a sequence attends to every other token, creating rich contextual representations. The core of the transformer architecture.</p>
<div class="example"><strong>Math:</strong> Attention(Q, K, V) = softmax(QKᵀ / √dₖ) V</div>
<button class="llm-btn" onclick="walkThrough('📐 Self-Attention', 'Walk through the self-attention formula step by step with a simple example. Show how Q, K, V matrices are computed, how the dot product creates attention scores, how softmax normalizes them, and how the final output is a weighted sum of V.')"><span class="icon">📐</span> Walk me through</button>
</div>
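The formula in the card above can be run end to end on the three one-hot embeddings from the example prompt. A minimal sketch in plain Python (illustrative only, not wired into this page's scripts), using those embeddings directly as Q, K and V:

```python
import math

def softmax(xs):
    # Subtract the max before exponentiating for numerical stability.
    m = max(xs)
    exps = [math.exp(x - m) for x in xs]
    total = sum(exps)
    return [e / total for e in exps]

def attention(Q, K, V):
    # Attention(Q, K, V) = softmax(Q Kᵀ / √dₖ) V, computed row by row.
    d_k = len(Q[0])
    out = []
    for q in Q:
        scores = [sum(qi * ki for qi, ki in zip(q, k)) / math.sqrt(d_k) for k in K]
        weights = softmax(scores)
        out.append([sum(w * v[j] for w, v in zip(weights, V)) for j in range(len(V[0]))])
    return out

# One-hot "embeddings" for tokens [A, B, C], used directly as Q, K and V.
X = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
out = attention(X, X, X)
```

Because V is the identity here, each output row is just that token's attention weights: every token attends most to itself (about 0.47) and splits the remainder evenly.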
<div class="def-card">
<span class="category">Architecture</span>
<h3>Multi-Head Attention</h3>
<p>Running multiple self-attention operations in parallel, each learning different types of relationships. Like having multiple "lenses" to view the input.</p>
<button class="llm-btn" onclick="walkThrough('📐 Multi-Head Attention', 'Walk through multi-head attention. If we have 4 heads with d_model=128, show how the model splits into 4 heads of dimension 32, runs self-attention on each, concatenates them, and projects back to dimension 128.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Architecture</span>
<h3>Positional Encoding</h3>
<p>Since transformers process all tokens simultaneously (unlike RNNs), position information must be added explicitly so the model knows word order.</p>
<button class="llm-btn" onclick="walkThrough('📐 Positional Encoding', 'Walk through positional encoding. Show how sinusoidal positional encodings work with a concrete example. If we have positions 0, 1, 2, 3 and dimension 4, show the actual encoding vectors and explain why sin/cos are used.')"><span class="icon">📐</span> Walk me through</button>
</div>
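The sinusoidal scheme from the prompt above can be sketched directly (positions 0-3, dimension 4). This assumes the standard formulation, where even dimensions use sin and odd dimensions use cos:

```python
import math

def positional_encoding(position, d_model):
    # angle_i = pos / 10000^(2i / d_model); even dims get sin, odd dims cos.
    vec = []
    for i in range(d_model // 2):
        angle = position / (10000 ** (2 * i / d_model))
        vec.extend([math.sin(angle), math.cos(angle)])
    return vec

encodings = [positional_encoding(pos, 4) for pos in range(4)]
```

Position 0 encodes as [0, 1, 0, 1]; later positions rotate each sin/cos pair at a different frequency, giving every position a unique, smoothly varying fingerprint.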
<div class="def-card">
<span class="category">Architecture</span>
<h3>Feed-Forward Network (FFN)</h3>
<p>After attention, each token passes through a small neural network that transforms its representation. Usually two linear layers with a non-linearity in between.</p>
<button class="llm-btn" onclick="walkThrough('📐 Feed-Forward Network (FFN)', 'Walk through the FFN in a transformer. Show the dimensions: if d_model=512 and d_ff=2048, show how each token goes through Linear(512, 2048) → ReLU → Linear(2048, 512). Give a concrete numerical example.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Architecture</span>
<h3>Layer Normalization</h3>
<p>A technique to stabilize training by normalizing the activations of each layer. Helps gradients flow more smoothly through deep networks.</p>
<button class="llm-btn" onclick="walkThrough('📐 Layer Normalization', 'Walk through layer normalization step by step. If a layer outputs [2, -1, 3, 0] for a single token, show how to compute the mean, variance, normalize, and then apply the learnable parameters γ and β.')"><span class="icon">📐</span> Walk me through</button>
</div>
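The [2, -1, 3, 0] example from the prompt, worked in plain Python (γ = 1 and β = 0 here for simplicity; those defaults and the ε value are assumptions of this sketch):

```python
import math

def layer_norm(x, gamma=1.0, beta=0.0, eps=1e-5):
    mu = sum(x) / len(x)                          # mean = 1.0
    var = sum((v - mu) ** 2 for v in x) / len(x)  # variance = 2.5
    return [gamma * (v - mu) / math.sqrt(var + eps) + beta for v in x]

normalized = layer_norm([2.0, -1.0, 3.0, 0.0])
# normalized ≈ [0.632, -1.265, 1.265, -0.632]: zero mean, unit variance
```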

<h2 class="section-title">Training Concepts</h2>
<div class="def-card">
<span class="category">Training</span>
<h3>Loss Function</h3>
<p>A mathematical measure of how far the model's predictions are from the correct answers. Training = minimizing this value. For language models, cross-entropy loss is standard.</p>
<div class="example"><strong>Example:</strong> If the correct next word is "cat" but the model assigns it 10% probability, the loss is high. If it assigns 90%, the loss is low.</div>
<button class="llm-btn" onclick="walkThrough('📐 Loss Function (Cross-Entropy)', 'Walk through cross-entropy loss with a concrete example. If the true class is &quot;cat&quot; and the model outputs probabilities [dog: 0.5, cat: 0.3, bird: 0.2], show the exact calculation of -log(0.3) and explain what this number means.')"><span class="icon">📐</span> Walk me through</button>
</div>
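The calculation requested in the prompt above, in plain Python (the probabilities [dog: 0.5, cat: 0.3, bird: 0.2] come from that prompt):

```python
import math

probs = {"dog": 0.5, "cat": 0.3, "bird": 0.2}
loss = -math.log(probs["cat"])   # true class is "cat": loss = -log(0.3) ≈ 1.204
confident_loss = -math.log(0.9)  # a 90% prediction would cost only ≈ 0.105
```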
<div class="def-card">
<span class="category">Training</span>
<h3>Gradient Descent</h3>
<p>The optimization algorithm that adjusts model weights in the direction that reduces loss. "Descent" because you're moving down the loss surface toward a minimum.</p>
<button class="llm-btn" onclick="walkThrough('📐 Gradient Descent', 'Walk through gradient descent with a concrete example. If loss(w) = (w - 3)², compute the gradient, show a weight update with learning rate 0.1 starting from w=0. Do 3 iterations and show how w approaches 3.')"><span class="icon">📐</span> Walk me through</button>
</div>
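The loss(w) = (w - 3)² example from the prompt, run for three iterations with learning rate 0.1 starting from w = 0:

```python
def grad(w):
    # d/dw (w - 3)^2 = 2 * (w - 3)
    return 2 * (w - 3)

w, lr = 0.0, 0.1
history = [w]
for _ in range(3):
    w -= lr * grad(w)   # step opposite the gradient
    history.append(w)
# history ≈ [0.0, 0.6, 1.08, 1.464]; w creeps toward the minimum at 3
```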
<div class="def-card">
<span class="category">Training</span>
<h3>Adam Optimizer</h3>
<p>The most popular optimizer for training deep learning models. Combines momentum (acceleration) with adaptive learning rates (per-parameter tuning).</p>
<button class="llm-btn" onclick="walkThrough('📐 Adam Optimizer', 'Walk through Adam optimizer step by step. Starting from gradient g=2 at step t=1, show how m (momentum) and v (variance) are computed with β1=0.9, β2=0.999, then show bias correction and the parameter update with lr=0.001.')"><span class="icon">📐</span> Walk me through</button>
</div>
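A single Adam step with the numbers from the prompt (g = 2 at t = 1, β₁ = 0.9, β₂ = 0.999, lr = 0.001), sketched in plain Python:

```python
import math

def adam_step(w, g, m, v, t, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
    m = b1 * m + (1 - b1) * g        # 1st moment (momentum):  0.2
    v = b2 * v + (1 - b2) * g * g    # 2nd moment (variance):  0.004
    m_hat = m / (1 - b1 ** t)        # bias-corrected:         2.0
    v_hat = v / (1 - b2 ** t)        # bias-corrected:         4.0
    w = w - lr * m_hat / (math.sqrt(v_hat) + eps)
    return w, m, v

w, m, v = adam_step(w=0.0, g=2.0, m=0.0, v=0.0, t=1)
```

Note the first step is almost exactly lr (0.001): after bias correction m̂/√v̂ ≈ g/|g| = 1, so Adam normalizes the step size regardless of the raw gradient magnitude.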
<div class="def-card">
<span class="category">Training</span>
<h3>Gradient</h3>
<p>A vector of partial derivatives showing the direction and rate of steepest increase of the loss. We move in the opposite direction to minimize loss.</p>
<button class="llm-btn" onclick="walkThrough('📐 Gradient', 'Walk through what a gradient is with a concrete example. If loss = w1² + 2w2², compute ∂loss/∂w1 and ∂loss/∂w2 at the point (w1=1, w2=2). Show what the gradient vector looks like and what direction we move in.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Training</span>
<h3>Regularization</h3>
<p>Techniques to prevent overfitting: dropout (randomly deactivating neurons), weight decay (penalizing large weights), and early stopping.</p>
<button class="llm-btn" onclick="walkThrough('📐 Regularization', 'Walk through L2 regularization (weight decay) with a concrete example. If loss = MSE + λ·Σw², show how the gradient changes with λ=0.01 vs λ=0. What happens to the weights over training?')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Training</span>
<h3>Batch Normalization</h3>
<p>Normalizing layer inputs across each mini-batch. Reduces internal covariate shift and allows higher learning rates.</p>
<button class="llm-btn" onclick="walkThrough('📐 Batch Normalization', 'Walk through batch normalization step by step. If a mini-batch has activations [1, 3, 5, 7], show how to compute the batch mean and variance, normalize, and apply γ=1.5, β=0.5. Show the final output.')"><span class="icon">📐</span> Walk me through</button>
</div>
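The prompt's mini-batch [1, 3, 5, 7] with γ = 1.5 and β = 0.5, worked in plain Python (the ε value is an assumption of this sketch):

```python
import math

def batch_norm(batch, gamma, beta, eps=1e-5):
    mu = sum(batch) / len(batch)                          # batch mean = 4.0
    var = sum((x - mu) ** 2 for x in batch) / len(batch)  # batch variance = 5.0
    return [gamma * (x - mu) / math.sqrt(var + eps) + beta for x in batch]

out = batch_norm([1.0, 3.0, 5.0, 7.0], gamma=1.5, beta=0.5)
# out ≈ [-1.512, -0.171, 1.171, 2.512]: mean shifted to β, spread scaled by γ
```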

<h2 class="section-title">Generation & Sampling</h2>
<div class="def-card">
<span class="category">Sampling</span>
<h3>Temperature</h3>
<p>Controls randomness in text generation. Low (0.2) = focused and near-deterministic. High (0.9) = creative and varied. 1.0 = standard sampling.</p>
<div class="example"><strong>Low temp:</strong> Technical documentation, code generation<br>
<strong>High temp:</strong> Creative writing, brainstorming</div>
<button class="llm-btn" onclick="walkThrough('📐 Temperature in Sampling', 'Walk through temperature with a concrete example. If a model outputs logits [2.0, 1.0, 0.0] for tokens [cat, dog, bird], show the probabilities at temperature=0.1, temperature=1.0, and temperature=2.0. Show the softmax math step by step.')"><span class="icon">📐</span> Walk me through</button>
</div>
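The logits [2.0, 1.0, 0.0] from the prompt, pushed through softmax at three temperatures (plain Python sketch):

```python
import math

def softmax_with_temperature(logits, temperature):
    # Divide logits by T first: low T sharpens the distribution, high T flattens it.
    scaled = [x / temperature for x in logits]
    m = max(scaled)
    exps = [math.exp(x - m) for x in scaled]
    total = sum(exps)
    return [e / total for e in exps]

logits = [2.0, 1.0, 0.0]                        # [cat, dog, bird]
sharp = softmax_with_temperature(logits, 0.1)   # ≈ [1.00, 0.00, 0.00]
plain = softmax_with_temperature(logits, 1.0)   # ≈ [0.67, 0.24, 0.09]
flat  = softmax_with_temperature(logits, 2.0)   # ≈ [0.51, 0.31, 0.19]
```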
<div class="def-card">
<span class="category">Sampling</span>
<h3>Top-K Sampling</h3>
<p>At each step, only consider the K most likely next tokens. Reduces weird or irrelevant outputs.</p>
<button class="llm-btn" onclick="walkThrough('📐 Top-K Sampling', 'Walk through Top-K sampling with an example. If a model outputs probabilities [cat: 0.4, dog: 0.25, bird: 0.15, fish: 0.1, car: 0.05, house: 0.03, tree: 0.02], show what happens with K=3 vs K=10. Show the renormalization.')"><span class="icon">📐</span> Walk me through</button>
</div>
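The distribution from the prompt, filtered with K = 3 (plain Python sketch):

```python
def top_k_filter(probs, k):
    # Keep the k most likely tokens, drop the rest, renormalize to sum to 1.
    kept = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)[:k]
    total = sum(p for _, p in kept)
    return {tok: p / total for tok, p in kept}

probs = {"cat": 0.4, "dog": 0.25, "bird": 0.15, "fish": 0.1,
         "car": 0.05, "house": 0.03, "tree": 0.02}
top3 = top_k_filter(probs, 3)
# top3 ≈ {"cat": 0.5, "dog": 0.3125, "bird": 0.1875}
```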
<div class="def-card">
<span class="category">Sampling</span>
<h3>Top-P (Nucleus) Sampling</h3>
<p>Only consider the smallest set of top tokens whose cumulative probability reaches P. More adaptive than Top-K — automatically adjusts the number of candidates.</p>
<div class="example"><strong>Top-P = 0.9:</strong> Include the smallest set of tokens that together cover 90% probability mass.</div>
<button class="llm-btn" onclick="walkThrough('📐 Top-P (Nucleus) Sampling', 'Walk through Top-P sampling with the same example: [cat: 0.4, dog: 0.25, bird: 0.15, fish: 0.1, car: 0.05, house: 0.03, tree: 0.02]. Show which tokens are included at P=0.9 and P=0.95, and how the probabilities are renormalized.')"><span class="icon">📐</span> Walk me through</button>
</div>
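The same distribution filtered with P = 0.9 (plain Python sketch): the nucleus stops growing as soon as the cumulative probability reaches P.

```python
def top_p_filter(probs, p):
    # Take tokens in descending order until cumulative probability >= p, then renormalize.
    kept, cumulative = {}, 0.0
    for tok, prob in sorted(probs.items(), key=lambda kv: kv[1], reverse=True):
        kept[tok] = prob
        cumulative += prob
        if cumulative >= p:
            break
    total = sum(kept.values())
    return {tok: prob / total for tok, prob in kept.items()}

probs = {"cat": 0.4, "dog": 0.25, "bird": 0.15, "fish": 0.1,
         "car": 0.05, "house": 0.03, "tree": 0.02}
nucleus = top_p_filter(probs, 0.9)   # keeps cat, dog, bird, fish (0.4+0.25+0.15+0.1 = 0.9)
```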
<div class="def-card">
<span class="category">Sampling</span>
<h3>Greedy Decoding</h3>
<p>Always pick the most likely next token. Fastest but can get stuck in repetitive loops. Often produces the most coherent output for factual tasks.</p>
<button class="llm-btn" onclick="walkThrough('📐 Greedy Decoding', 'Walk through greedy decoding vs beam search with a concrete example. If at step 1 the model outputs [the: 0.5, a: 0.3, I: 0.2], show what greedy picks. At step 2, if the continuation probabilities depend on the first token, show how greedy might get stuck vs beam search.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Sampling</span>
<h3>Beam Search</h3>
<p>Instead of picking the single best token at each step, keep the top B sequences and pick the best overall. Better quality but slower.</p>
<button class="llm-btn" onclick="walkThrough('📐 Beam Search', 'Walk through beam search with beam size 2. Show how at each step, B sequences are expanded, pruned, and scored. Use a simple example with 3 possible tokens at each step. Show how beam search might find a better overall sequence than greedy.')"><span class="icon">📐</span> Walk me through</button>
</div>
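A toy two-step example where greedy and beam search disagree. The step-1 probabilities come from the greedy card's prompt; the step-2 table is hypothetical, chosen so that the locally best first token does not lead to the best full sequence:

```python
# Step-1 probabilities (from the card) and hypothetical step-2 probabilities
# conditioned on the first token.
step1 = {"the": 0.5, "a": 0.3, "I": 0.2}
step2 = {
    "the": {"cat": 0.4, "dog": 0.3, "end": 0.3},
    "a":   {"cat": 0.9, "dog": 0.05, "end": 0.05},
    "I":   {"cat": 0.3, "dog": 0.3, "end": 0.4},
}

def greedy():
    # Pick the single best token at each step.
    first = max(step1, key=step1.get)
    second = max(step2[first], key=step2[first].get)
    return (first, second), step1[first] * step2[first][second]

def beam_search(beam_size=2):
    # Keep the top-B first tokens, expand each, score complete sequences.
    beams = sorted(step1.items(), key=lambda kv: kv[1], reverse=True)[:beam_size]
    candidates = [((tok, tok2), p * p2)
                  for tok, p in beams
                  for tok2, p2 in step2[tok].items()]
    return max(candidates, key=lambda c: c[1])
```

Greedy commits to "the" (0.5) and ends at probability 0.5 × 0.4 = 0.2; beam search keeps "a" alive and finds "a cat" at 0.3 × 0.9 = 0.27.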
<div class="def-card">
<span class="category">Sampling</span>
<h3>Logits</h3>
<p>The raw, unnormalized scores the model outputs for each token before softmax. Can be adjusted for bias correction, repetition penalties, and custom sampling.</p>
<button class="llm-btn" onclick="walkThrough('📐 Logits', 'Walk through logits step by step. If the model outputs logits [2.0, 1.0, 0.0] for [cat, dog, bird], show: 1) softmax to get probabilities, 2) log to get log-probabilities, 3) how temperature scales logits, 4) how top-k filtering modifies them.')"><span class="icon">📐</span> Walk me through</button>
</div>

<h2 class="section-title">Evaluation Metrics</h2>
<div class="def-card">
<span class="category">Metrics</span>
<h3>Perplexity</h3>
<p>Measures how "surprised" the model is by test data. Lower is better. A perplexity of 100 means the model is as confused as choosing uniformly from 100 options.</p>
<div class="example"><strong>Example:</strong> Perplexity 5 on a language model means, on average, it's as uncertain as picking from 5 equally likely options at each step.</div>
<button class="llm-btn" onclick="walkThrough('📐 Perplexity', 'Walk through perplexity calculation step by step. If a model assigns probabilities [0.9, 0.8, 0.7, 0.6] to 4 correct tokens, show: 1) cross-entropy loss calculation, 2) how perplexity = 2^cross-entropy, 3) the final perplexity value and what it means.')"><span class="icon">📐</span> Walk me through</button>
</div>
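The prompt's example worked in plain Python. Using log base 2, perplexity = 2^(average cross-entropy); equivalently, it is the inverse geometric mean of the probabilities assigned to the correct tokens:

```python
import math

def perplexity(token_probs):
    # Average -log2(p) over tokens (cross-entropy in bits), then exponentiate.
    h = -sum(math.log2(p) for p in token_probs) / len(token_probs)
    return 2 ** h

ppl = perplexity([0.9, 0.8, 0.7, 0.6])   # ≈ 1.35: the model is rarely surprised
```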
<div class="def-card">
<span class="category">Metrics</span>
<h3>Accuracy</h3>
<p>Percentage of correct predictions. Simple but can be misleading for imbalanced datasets.</p>
<button class="llm-btn" onclick="walkThrough('📐 Accuracy', 'Walk through accuracy with a concrete example. If you have 1000 examples where 950 are class A and 50 are class B, and your model predicts everything as class A, what is the accuracy? Why is this misleading? Show a better metric.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Metrics</span>
<h3>Precision & Recall</h3>
<p>Precision = of all positive predictions, how many were correct? Recall = of all actual positives, how many did we find?</p>
<div class="example"><strong>Spam filter:</strong> High precision = few legitimate emails flagged. High recall = few spam emails missed.</div>
<button class="llm-btn" onclick="walkThrough('📐 Precision & Recall', 'Walk through precision and recall with a spam filter example. If the filter flags 100 emails as spam, and 80 are actually spam (20 are legitimate), and the inbox contains 100 spam emails in total (so the filter missed 20), compute precision, recall, and F1.')"><span class="icon">📐</span> Walk me through</button>
</div>
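A plain-Python version of the spam-filter arithmetic. The counts are hypothetical: the filter flags 100 emails, 80 of which really are spam, and the inbox contains 100 spam emails in total, so 20 were missed.

```python
def precision_recall_f1(tp, fp, fn):
    precision = tp / (tp + fp)   # of flagged emails, how many were really spam
    recall = tp / (tp + fn)      # of actual spam, how many were caught
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

p, r, f1 = precision_recall_f1(tp=80, fp=20, fn=20)   # 0.8, 0.8, 0.8
```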
<div class="def-card">
<span class="category">Metrics</span>
<h3>F1 Score</h3>
<p>The harmonic mean of precision and recall. A single metric that balances both.</p>
<button class="llm-btn" onclick="walkThrough('📐 F1 Score', 'Walk through F1 score calculation. If precision = 0.8 and recall = 0.6, show: 1) the harmonic mean formula, 2) the step-by-step calculation, 3) why harmonic mean is used instead of arithmetic mean, 4) what the resulting F1 ≈ 0.69 means.')"><span class="icon">📐</span> Walk me through</button>
</div>
<div class="def-card">
<span class="category">Metrics</span>
<h3>BLEU / ROUGE</h3>
<p>Metrics for evaluating text generation quality by comparing model output to reference text. BLEU counts n-gram overlap (used for translation). ROUGE is similar but common for summarization.</p>
<button class="llm-btn" onclick="walkThrough('📐 BLEU Score', 'Walk through BLEU score calculation. If the model outputs &quot;the cat sat on the mat&quot; and the reference is &quot;the cat sat on the mat&quot;, compute 1-gram, 2-gram, 3-gram, and 4-gram precision. Show the final BLEU score with the brevity penalty.')"><span class="icon">📐</span> Walk me through</button>
</div>
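Clipped n-gram precision, the building block of BLEU, in plain Python. (Full BLEU is the geometric mean of the 1- to 4-gram precisions times a brevity penalty; this sketch shows only the per-n precision.)

```python
from collections import Counter

def ngram_precision(candidate, reference, n):
    # Count candidate n-grams, clipping each match at its count in the reference.
    def ngrams(text):
        tokens = text.split()
        return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    cand, ref = ngrams(candidate), ngrams(reference)
    overlap = sum(min(count, ref[g]) for g, count in cand.items())
    return overlap / max(sum(cand.values()), 1)

reference = "the cat sat on the mat"
p1 = ngram_precision("the cat sat on the mat", reference, 1)   # 1.0 (exact match)
p_clipped = ngram_precision("the the the", reference, 1)       # 2/3: "the" clipped at 2
```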

<h2 class="section-title">Key Formulas</h2>
<table class="glossary-table">
<thead>
<tr><th>Concept</th><th>Formula</th><th>What it means</th></tr>
</thead>
<tbody>
<tr><td>Attention</td><td>softmax(QKᵀ/√dₖ)V</td><td>Weigh inputs by relevance</td></tr>
<tr><td>Cross-Entropy Loss</td><td>-Σ yᵢ log(pᵢ)</td><td>Penalizes wrong predictions</td></tr>
<tr><td>Softmax</td><td>exp(xᵢ) / Σⱼ exp(xⱼ)</td><td>Converts scores to probabilities</td></tr>
<tr><td>ReLU</td><td>max(0, x)</td><td>Activation: passes positive values only</td></tr>
<tr><td>Layer Norm</td><td>γ × (x − μ)/σ + β</td><td>Normalizes per-sample activations</td></tr>
<tr><td>F1 Score</td><td>2 × (P×R)/(P+R)</td><td>Harmonic mean of precision & recall</td></tr>
<tr><td>Perplexity</td><td>2^(cross-entropy in bits)</td><td>Effective branching factor</td></tr>
</tbody>
</table>

</div>

<footer>AI Cheat Sheet — A learning reference for artificial intelligence</footer>

<div class="search-results-dropdown" id="searchResultsContainer"></div>

<script src="../lib/modal.js"></script>
<script src="../lib/llm.js"></script>
<script src="../lib/search.js"></script>
<script>Search.init();</script>
<script>
(function(){
function walkThrough(title, prompt) {
LLMModal.open(title);
var messages = [
{ role: 'system', content: 'You are an AI math tutor. Walk through the requested concept step by step with concrete numerical examples. Show every calculation explicitly. Use code blocks for math. Explain each step in plain language. Make it feel like a patient teacher working through a problem.' },
{ role: 'user', content: prompt }
];

var fullText = '';
LLM.callAPI(
messages,
function(chunk) {
fullText += chunk;
LLMModal.update(fullText);
},
function() {},
function(err) {
LLMModal.error(err);
}
);
}

window.walkThrough = walkThrough;
})();
</script>

</body>
</html>