aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--README.md28
-rw-r--r--build_index.py192
-rw-r--r--cache_model.py34
-rw-r--r--cards.jsonl31
-rw-r--r--cheat.dbbin0 -> 118784 bytes
-rw-r--r--init_db.py65
-rw-r--r--inspect_db.py34
-rw-r--r--query_index.py148
-rw-r--r--requirements.txt2
10 files changed, 536 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b7a10c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+venv
+models \ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..534001e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# cheat
+
+Local command-helper retrieval system using JSONL, SQLite, and sentence-transformers.
+
+## Setup
+
+```sh
+export HF_HOME="$PWD/models/hf"
+export SENTENCE_TRANSFORMERS_HOME="$PWD/models/hf"
+python -m venv venv
+source venv/bin/activate
+pip install -U pip
+pip install -r requirements.txt
+python init_db.py
+python build_index.py
+```
+
+Then run a query like this:
+
+```sh
+python query_index.py "get free disk space"
+```
+
+To add commands, add to `./cards.jsonl` and rebuild the index:
+
+```sh
+python build_index.py
+```
diff --git a/build_index.py b/build_index.py
new file mode 100644
index 0000000..8597c68
--- /dev/null
+++ b/build_index.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+import os
+from pathlib import Path
+
# Paths and model configuration for the indexer.
DB_PATH = Path("cheat.db")
CARDS_PATH = Path("./cards.jsonl")
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Keep the Hugging Face cache inside the project tree so model files travel
# with the checkout instead of landing in ~/.cache.  (The original assigned
# LOCAL_CACHE_DIR twice with the same value; the duplicate is removed.)
LOCAL_CACHE_DIR = Path("models/hf")
os.environ.setdefault("HF_HOME", str(LOCAL_CACHE_DIR.resolve()))
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(LOCAL_CACHE_DIR.resolve()))
+
# Every card in cards.jsonl must carry exactly these keys; load_cards()
# raises ValueError for any line missing one of them.
REQUIRED_FIELDS = {
    "id",
    "intent",
    "command",
    "alternatives",
    "explanation",
    "requires",
    "packages",
    "tags",
    "platform",
    "shell",
    "safety",
}
+
+
def load_cards(path: Path) -> list[dict[str, Any]]:
    """Parse the JSONL card file into dicts, validating required fields.

    Blank lines are skipped. Raises ValueError (with the 1-based line
    number) on malformed JSON or on a card missing any REQUIRED_FIELDS key.
    """
    loaded: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            text = raw.strip()
            if not text:
                continue

            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"Invalid JSON on line {line_no}: {exc}") from exc

            absent = REQUIRED_FIELDS - set(record.keys())
            if absent:
                raise ValueError(f"Missing required fields on line {line_no}: {sorted(absent)}")

            loaded.append(record)
    return loaded
+
+
def build_search_text(card: dict[str, Any]) -> str:
    """Render a card into the compact text blob that gets embedded.

    This is what the retriever searches over. Only non-empty fields
    contribute a line, so sparse cards stay short. Field order is fixed:
    intents, tags, command, alternatives, explanation, requires, platform.
    """
    lines: list[str] = []

    def add_joined(label: str, sep: str, values: list[str]) -> None:
        # List-valued fields become one labelled, separator-joined line.
        if values:
            lines.append(label + sep.join(values))

    add_joined("Intents: ", " | ", card.get("intent", []))
    add_joined("Tags: ", ", ", card.get("tags", []))

    command = card.get("command", "")
    if command:
        lines.append("Command: " + command)

    add_joined("Alternatives: ", " | ", card.get("alternatives", []))

    explanation = card.get("explanation", "")
    if explanation:
        lines.append("Explanation: " + explanation)

    add_joined("Requires: ", ", ", card.get("requires", []))
    add_joined("Platform: ", ", ", card.get("platform", []))

    return "\n".join(lines)
+
+
def serialize_embedding(vec: np.ndarray) -> bytes:
    """Serialize an embedding vector to raw float32 bytes for BLOB storage."""
    as_f32 = vec.astype(np.float32)
    return as_f32.tobytes()
+
+
def upsert_card(conn: sqlite3.Connection, card: dict[str, Any], search_text: str) -> None:
    """Insert or update one card row keyed by id.

    List/dict-valued card fields are stored as JSON text columns; on
    conflict every column is refreshed and updated_at is bumped. The
    caller is responsible for committing.
    """
    conn.execute("""
        INSERT INTO cards (
            id, command, explanation, intent_json, alternatives_json, requires_json,
            packages_json, tags_json, platform_json, shell_json, safety, search_text, updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(id) DO UPDATE SET
            command=excluded.command,
            explanation=excluded.explanation,
            intent_json=excluded.intent_json,
            alternatives_json=excluded.alternatives_json,
            requires_json=excluded.requires_json,
            packages_json=excluded.packages_json,
            tags_json=excluded.tags_json,
            platform_json=excluded.platform_json,
            shell_json=excluded.shell_json,
            safety=excluded.safety,
            search_text=excluded.search_text,
            updated_at=CURRENT_TIMESTAMP
    """, (
        card["id"],
        card["command"],
        card["explanation"],
        # ensure_ascii=False keeps non-ASCII text readable in the DB.
        json.dumps(card["intent"], ensure_ascii=False),
        json.dumps(card["alternatives"], ensure_ascii=False),
        json.dumps(card["requires"], ensure_ascii=False),
        json.dumps(card["packages"], ensure_ascii=False),
        json.dumps(card["tags"], ensure_ascii=False),
        json.dumps(card["platform"], ensure_ascii=False),
        json.dumps(card["shell"], ensure_ascii=False),
        card["safety"],
        search_text,
    ))
+
+
def upsert_embedding(
    conn: sqlite3.Connection,
    card_id: str,
    model_name: str,
    vec: np.ndarray,
) -> None:
    """Insert or replace the embedding row for a card.

    Stores the vector as a float32 BLOB plus its dimension so readers can
    validate length on deserialization. The caller commits.
    """
    conn.execute("""
        INSERT INTO card_embeddings (
            card_id, model_name, embedding_blob, embedding_dim
        )
        VALUES (?, ?, ?, ?)
        ON CONFLICT(card_id) DO UPDATE SET
            model_name=excluded.model_name,
            embedding_blob=excluded.embedding_blob,
            embedding_dim=excluded.embedding_dim
    """, (
        card_id,
        model_name,
        serialize_embedding(vec),
        int(vec.shape[0]),
    ))
+
+
def main() -> None:
    """Embed every card from cards.jsonl and upsert rows into cheat.db.

    Requires the database schema (init_db.py) and a locally cached model
    (cache_model.py) to exist already; never touches the network.
    """
    if not DB_PATH.exists():
        # Fixed: the old message said "scripts/init_db.py", but this commit
        # places init_db.py at the repository root.
        raise FileNotFoundError(
            f"Database not found at {DB_PATH}. Run init_db.py first."
        )
    if not CARDS_PATH.exists():
        raise FileNotFoundError(f"Cards file not found at {CARDS_PATH}")

    cards = load_cards(CARDS_PATH)

    # local_files_only=True forces use of the pre-populated cache.
    model = SentenceTransformer(
        MODEL_NAME,
        cache_folder=str(LOCAL_CACHE_DIR.resolve()),
        local_files_only=True,
    )

    search_texts = [build_search_text(card) for card in cards]
    # normalize_embeddings=True lets the query side rank by plain dot product.
    embeddings = model.encode(
        search_texts,
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA foreign_keys=ON;")
        for card, vec, search_text in zip(cards, embeddings, search_texts):
            upsert_card(conn, card, search_text)
            upsert_embedding(conn, card["id"], MODEL_NAME, vec)
        conn.commit()
        print(f"Indexed {len(cards)} cards into {DB_PATH}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
diff --git a/cache_model.py b/cache_model.py
new file mode 100644
index 0000000..6478a50
--- /dev/null
+++ b/cache_model.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from sentence_transformers import SentenceTransformer
+
# Embedding model to pre-fetch and the project-local directory that caches it.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LOCAL_CACHE_DIR = Path("models/hf")
+
+
def main() -> None:
    """Download the embedding model into the project-local cache directory."""
    LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True)

    cache_dir = str(LOCAL_CACHE_DIR.resolve())
    os.environ.setdefault("HF_HOME", cache_dir)
    os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", cache_dir)

    print(f"Caching model: {MODEL_NAME}")
    print(f"Cache dir: {LOCAL_CACHE_DIR.resolve()}")

    model = SentenceTransformer(
        MODEL_NAME,
        cache_folder=cache_dir,
    )

    # An actual encode call makes sure every file the model needs is fetched,
    # not just the configuration.
    _ = model.encode(["test"], convert_to_numpy=True)

    print("Model cached successfully.")


if __name__ == "__main__":
    main()
diff --git a/cards.jsonl b/cards.jsonl
new file mode 100644
index 0000000..ed19844
--- /dev/null
+++ b/cards.jsonl
@@ -0,0 +1,31 @@
+{"id":"glxinfo_opengl_renderer","intent":["show which GPU OpenGL is using","see OpenGL renderer","check OpenGL vendor and renderer","find what graphics device OpenGL is using"],"command":"glxinfo -B","alternatives":["glxinfo | grep \"OpenGL\""],"explanation":"Shows the OpenGL vendor, renderer, and version in a short summary.","requires":["glxinfo"],"packages":{"fedora":["mesa-demos"],"debian":["mesa-utils"], "arch":["mesa-utils"]},"tags":["gpu","opengl","graphics","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"vulkaninfo_gpu","intent":["show which GPU Vulkan is using","see Vulkan driver info","check Vulkan renderer","find Vulkan device information"],"command":"vulkaninfo --summary","alternatives":["vulkaninfo | grep driverName"],"explanation":"Shows a concise summary of Vulkan devices and drivers.","requires":["vulkaninfo"],"packages":{"fedora":["vulkan-tools"],"debian":["vulkan-tools"]},"tags":["gpu","vulkan","graphics","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"lsblk_block_devices","intent":["list disks and partitions","show block devices","see drives and partitions","inspect storage devices"],"command":"lsblk","alternatives":["lsblk -f"],"explanation":"Lists block devices such as disks, partitions, and mount points.","requires":["lsblk"],"packages":{},"tags":["disk","storage","devices"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"df_h_disk_usage","intent":["show disk free space","check filesystem usage","see disk usage by filesystem","how full are my disks"],"command":"df -h","alternatives":[],"explanation":"Shows filesystem size, used space, and available space in human-readable units.","requires":["df"],"packages":{},"tags":["disk","filesystem","space"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"du_sh_directory_size","intent":["show size of current directory","check folder size","how large is this directory","measure directory disk usage"],"command":"du -sh .","alternatives":["du -sh *"],"explanation":"Shows total size of the current directory.","requires":["du"],"packages":{},"tags":["disk","directory","size"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"find_large_files","intent":["find large files","show biggest files under current directory","locate files larger than 500MB","search for huge files"],"command":"find . -type f -size +500M","alternatives":["find /path -type f -size +1G"],"explanation":"Finds files larger than a given size.","requires":["find"],"packages":{},"tags":["find","files","disk","cleanup"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"find_name_case_insensitive","intent":["find a file by name ignoring case","search for filename case insensitive","locate file by partial name","find matching filename"],"command":"find . -iname '*pattern*'","alternatives":[],"explanation":"Searches recursively for files whose names match a case-insensitive pattern.","requires":["find"],"packages":{},"tags":["find","files","search"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"rg_text_recursive","intent":["search text recursively in files","grep through a project","find text in source tree","search contents of files fast"],"command":"rg 'pattern'","alternatives":["grep -R \"pattern\" ."],"explanation":"Searches recursively for text, usually faster and cleaner than grep -R.","requires":["rg"],"packages":{"fedora":["ripgrep"],"debian":["ripgrep"]},"tags":["search","text","grep","files","code"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ss_listening_ports","intent":["show open listening ports","list listening sockets","see what ports are open locally","check listening network services"],"command":"ss -ltnp","alternatives":["sudo ss -ltnp"],"explanation":"Shows listening TCP sockets and associated processes when permitted.","requires":["ss"],"packages":{},"tags":["network","ports","sockets","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ss_process_on_port","intent":["find process using port 8080","see what is listening on a port","which program owns port 3000","identify service bound to port"],"command":"ss -ltnp | grep ':8080'","alternatives":["sudo ss -ltnp | grep ':8080'"],"explanation":"Filters listening sockets to the requested port.","requires":["ss","grep"],"packages":{},"tags":["network","ports","process"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ip_addr_show","intent":["show IP addresses","list network interfaces and addresses","see local IPs","inspect interface addresses"],"command":"ip addr","alternatives":["ip -brief addr"],"explanation":"Shows network interfaces and assigned IP addresses.","requires":["ip"],"packages":{},"tags":["network","ip","interfaces"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ip_route_show","intent":["show routing table","see default route","inspect routes","what route is this host using"],"command":"ip route","alternatives":[],"explanation":"Displays the kernel routing table, including the default route.","requires":["ip"],"packages":{},"tags":["network","routing"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ping_basic","intent":["test network reachability","ping a host","check if a machine is reachable","see if DNS and network work"],"command":"ping -c 4 example.com","alternatives":[],"explanation":"Sends a few ICMP echo requests to test reachability and latency.","requires":["ping"],"packages":{},"tags":["network","diagnostics","latency"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"curl_headers","intent":["show HTTP headers","inspect response headers from a URL","check server response headers","see HTTP status and headers"],"command":"curl -I https://example.com","alternatives":["curl -sSI https://example.com"],"explanation":"Fetches only the response headers from a URL.","requires":["curl"],"packages":{},"tags":["http","curl","headers","web"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"curl_download_file","intent":["download a file from a URL","save URL to local file","fetch a file with curl","download file without browser"],"command":"curl -LO https://example.com/file.tar.gz","alternatives":["wget https://example.com/file.tar.gz"],"explanation":"Downloads a file and saves it with the remote name.","requires":["curl"],"packages":{},"tags":["http","download","curl"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"journalctl_boot_errors","intent":["show boot errors","inspect errors from current boot","check systemd errors since boot","look at boot logs"],"command":"journalctl -b -p err","alternatives":["journalctl -b"],"explanation":"Shows error-priority messages from the current boot.","requires":["journalctl"],"packages":{},"tags":["logs","systemd","boot","errors"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"journalctl_service_follow","intent":["follow logs for a systemd service","tail service logs","watch logs from sshd","stream unit logs"],"command":"journalctl -u sshd -f","alternatives":[],"explanation":"Follows logs for a specific systemd unit in real time.","requires":["journalctl"],"packages":{},"tags":["logs","systemd","service"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"ps_grep_process","intent":["find a running process by name","check if process is running","search process list","locate process from command name"],"command":"ps aux | grep '[n]ame'","alternatives":["pgrep -a name"],"explanation":"Searches the process list while avoiding matching the grep process itself.","requires":["ps","grep"],"packages":{},"tags":["process","ps","grep"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"pgrep_full_cmdline","intent":["find process by name with pid","show pid for matching process","lookup running process and command line","find process quickly"],"command":"pgrep -a name","alternatives":[],"explanation":"Prints matching PIDs along with their command lines.","requires":["pgrep"],"packages":{},"tags":["process","pid","lookup"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"free_h_memory","intent":["show memory usage","check RAM usage","how much memory is free","inspect memory consumption summary"],"command":"free -h","alternatives":[],"explanation":"Shows total, used, and available memory in human-readable units.","requires":["free"],"packages":{},"tags":["memory","ram","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"top_overview","intent":["show top processes","inspect CPU usage live","watch system activity","interactive process viewer"],"command":"top","alternatives":["htop"],"explanation":"Shows a live interactive overview of processes and resource usage.","requires":["top"],"packages":{"fedora":["htop"],"debian":["htop"]},"tags":["cpu","memory","process","monitoring"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"chmod_executable","intent":["make a script executable","add execute permission to file","chmod script so it can run directly","set executable bit"],"command":"chmod +x script.sh","alternatives":[],"explanation":"Adds execute permission so the file can be run directly.","requires":["chmod"],"packages":{},"tags":["permissions","chmod","scripts"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"}
+{"id":"tar_extract_gz","intent":["extract a tar.gz archive","unpack tgz file","untar compressed archive","open tarball"],"command":"tar -xvf archive.tar.gz","alternatives":["tar -xzf archive.tar.gz"],"explanation":"Extracts files from a tar archive; some users prefer -z explicitly for gzip.","requires":["tar"],"packages":{},"tags":["archive","tar","extract"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"}
+{"id":"tar_create_gz","intent":["create a tar.gz archive","compress a directory into tar.gz","make a tarball","archive files into gzip tar"],"command":"tar -cvzf archive.tar.gz dir/","alternatives":[],"explanation":"Creates a gzip-compressed tar archive from a directory.","requires":["tar"],"packages":{},"tags":["archive","tar","compress"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"}
+{"id":"systemctl_status_service","intent":["check status of a service","see if systemd service is running","inspect service health","show systemctl status"],"command":"systemctl status sshd","alternatives":["systemctl --no-pager status sshd"],"explanation":"Shows service state, recent logs, and unit metadata.","requires":["systemctl"],"packages":{},"tags":["systemd","service","status"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"systemctl_restart_service","intent":["restart a service","bounce systemd unit","restart sshd","reload service by restarting it"],"command":"sudo systemctl restart sshd","alternatives":["sudo systemctl try-restart sshd"],"explanation":"Restarts a systemd service; usually requires root privileges.","requires":["systemctl","sudo"],"packages":{},"tags":["systemd","service","restart"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"privileged-modifying"}
+{"id":"dnf_which_package_owns_file","intent":["find which package owns a file","what rpm provides this path","determine package for installed file","lookup file owner package"],"command":"rpm -qf /path/to/file","alternatives":["dnf provides /path/to/file"],"explanation":"Shows which installed package owns a given file; dnf provides can search repos too.","requires":["rpm"],"packages":{},"tags":["package","rpm","dnf","files"],"platform":["linux","fedora"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"dnf_search_package","intent":["search for a package in dnf","find package by name or description","lookup package in repositories","search repo packages"],"command":"dnf search keyword","alternatives":[],"explanation":"Searches package metadata in enabled repositories.","requires":["dnf"],"packages":{},"tags":["package","dnf","search"],"platform":["linux","fedora"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"git_status_short","intent":["show git status briefly","check repo state","see changed files in git","short git status"],"command":"git status --short","alternatives":["git status"],"explanation":"Shows tracked and untracked file changes in a compact format.","requires":["git"],"packages":{},"tags":["git","status","repo"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"git_log_oneline_graph","intent":["show concise git history","see commit graph briefly","inspect recent commits compactly","git log one line graph"],"command":"git log --oneline --graph --decorate -n 20","alternatives":[],"explanation":"Shows a compact decorated commit graph for recent history.","requires":["git"],"packages":{},"tags":["git","history","commits"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
+{"id":"git_show_changed_files","intent":["show files changed in last commit","list files changed by commit","see modified files from HEAD","inspect changed paths in commit"],"command":"git show --name-only --oneline HEAD","alternatives":["git diff --name-only HEAD~1 HEAD"],"explanation":"Shows the last commit summary and the files it changed.","requires":["git"],"packages":{},"tags":["git","diff","files","commits"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"}
diff --git a/cheat.db b/cheat.db
new file mode 100644
index 0000000..ce146cf
--- /dev/null
+++ b/cheat.db
Binary files differ
diff --git a/init_db.py b/init_db.py
new file mode 100644
index 0000000..a401a91
--- /dev/null
+++ b/init_db.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
# SQLite database file, created in the current working directory.
DB_PATH = Path("cheat.db")
+
+
def main() -> None:
    """Create the cheat.db schema: cards, card_embeddings, and indexes.

    Idempotent — every statement uses IF NOT EXISTS, so re-running is safe.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        # WAL journal for better concurrent reads; foreign_keys so the
        # embeddings table's ON DELETE CASCADE actually fires.
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.execute("PRAGMA foreign_keys=ON;")

        # One row per command card; list/dict fields are JSON text columns.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS cards (
                id TEXT PRIMARY KEY,
                command TEXT NOT NULL,
                explanation TEXT NOT NULL,
                intent_json TEXT NOT NULL,
                alternatives_json TEXT NOT NULL,
                requires_json TEXT NOT NULL,
                packages_json TEXT NOT NULL,
                tags_json TEXT NOT NULL,
                platform_json TEXT NOT NULL,
                shell_json TEXT NOT NULL,
                safety TEXT NOT NULL,
                search_text TEXT NOT NULL,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT DEFAULT CURRENT_TIMESTAMP
            );
        """)

        # One embedding per card; deleting a card removes its embedding.
        conn.execute("""
            CREATE TABLE IF NOT EXISTS card_embeddings (
                card_id TEXT PRIMARY KEY,
                model_name TEXT NOT NULL,
                embedding_blob BLOB NOT NULL,
                embedding_dim INTEGER NOT NULL,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY(card_id) REFERENCES cards(id) ON DELETE CASCADE
            );
        """)

        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_cards_command
            ON cards(command);
        """)

        conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_cards_safety
            ON cards(safety);
        """)

        conn.commit()
        print(f"Initialized database at {DB_PATH}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
diff --git a/inspect_db.py b/inspect_db.py
new file mode 100644
index 0000000..0419f26
--- /dev/null
+++ b/inspect_db.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+
# Fixed: was "storage/cmdhelp.db", a stale path from an earlier project name.
# Every other script (init_db.py, build_index.py, query_index.py) and the
# committed database use cheat.db at the repo root.
DB_PATH = Path("cheat.db")
+
+
def main() -> None:
    """Print row counts and a small sample of cards as a sanity check."""
    conn = sqlite3.connect(DB_PATH)
    try:
        n_cards = conn.execute("SELECT COUNT(*) FROM cards").fetchone()[0]
        n_embeddings = conn.execute("SELECT COUNT(*) FROM card_embeddings").fetchone()[0]

        print(f"cards: {n_cards}")
        print(f"embeddings: {n_embeddings}")
        print()

        # Show the first few cards so schema problems are visible at a glance.
        sample = conn.execute("""
            SELECT id, command, safety
            FROM cards
            ORDER BY id
            LIMIT 10
        """).fetchall()

        for entry in sample:
            print(entry)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
diff --git a/query_index.py b/query_index.py
new file mode 100644
index 0000000..7dee4d3
--- /dev/null
+++ b/query_index.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import os
+from pathlib import Path
+
# Database produced by init_db.py and populated by build_index.py.
DB_PATH = Path("cheat.db")

# Resolve the embedding model from the project-local cache; the env vars are
# set before any model load so Hugging Face looks in models/hf first.
LOCAL_CACHE_DIR = Path("models/hf")
os.environ.setdefault("HF_HOME", str(LOCAL_CACHE_DIR.resolve()))
os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(LOCAL_CACHE_DIR.resolve()))
+
def deserialize_embedding(blob: bytes, dim: int) -> np.ndarray:
    """Decode a float32 BLOB into a 1-D vector, validating its length.

    Raises ValueError if the decoded length differs from the stored dim.
    """
    decoded = np.frombuffer(blob, dtype=np.float32)
    if decoded.shape[0] != dim:
        raise ValueError(f"Embedding length mismatch: expected {dim}, got {decoded.shape[0]}")
    return decoded
+
+
def load_index(conn: sqlite3.Connection) -> tuple[list[dict[str, Any]], np.ndarray, str]:
    """Load every indexed card and its embedding from SQLite.

    Returns (cards, matrix, model_name) where matrix row i is the embedding
    of cards[i]. Raises RuntimeError if no rows are indexed yet or if rows
    were embedded with more than one model.
    """
    rows = conn.execute("""
        SELECT
            c.id,
            c.command,
            c.explanation,
            c.intent_json,
            c.alternatives_json,
            c.requires_json,
            c.packages_json,
            c.tags_json,
            c.platform_json,
            c.shell_json,
            c.safety,
            e.model_name,
            e.embedding_blob,
            e.embedding_dim
        FROM cards c
        JOIN card_embeddings e ON c.id = e.card_id
        ORDER BY c.id
    """).fetchall()

    if not rows:
        raise RuntimeError("No indexed cards found. Run build_index.py first.")

    cards: list[dict[str, Any]] = []
    vectors: list[np.ndarray] = []
    model_name: str | None = None

    for row in rows:
        # Column order matches the SELECT list above.
        (
            card_id,
            command,
            explanation,
            intent_json,
            alternatives_json,
            requires_json,
            packages_json,
            tags_json,
            platform_json,
            shell_json,
            safety,
            row_model_name,
            embedding_blob,
            embedding_dim,
        ) = row

        # All embeddings must come from one model, or similarity scores
        # would not be comparable.
        if model_name is None:
            model_name = row_model_name
        elif model_name != row_model_name:
            raise RuntimeError("Mixed embedding models found in the index.")

        cards.append({
            "id": card_id,
            "command": command,
            "explanation": explanation,
            "intent": json.loads(intent_json),
            "alternatives": json.loads(alternatives_json),
            "requires": json.loads(requires_json),
            "packages": json.loads(packages_json),
            "tags": json.loads(tags_json),
            "platform": json.loads(platform_json),
            "shell": json.loads(shell_json),
            "safety": safety,
        })
        vectors.append(deserialize_embedding(embedding_blob, embedding_dim))

    matrix = np.vstack(vectors)
    return cards, matrix, model_name
+
+
def search(
    query: str,
    top_k: int = 5,
) -> list[dict[str, Any]]:
    """Return the top_k cards most similar to a natural-language query.

    Each result is a card dict plus a float "score" key. Embeddings were
    stored normalized and the query is encoded with
    normalize_embeddings=True, so the dot product is cosine similarity.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cards, matrix, model_name = load_index(conn)
    finally:
        conn.close()

    # Use the same model the index was built with, from the local cache only.
    model = SentenceTransformer(
        model_name,
        cache_folder=str(LOCAL_CACHE_DIR.resolve()),
        local_files_only=True,
    )

    qvec = model.encode([query], normalize_embeddings=True, convert_to_numpy=True)[0]

    # Rank by similarity, best first.
    scores = matrix @ qvec
    top_indices = np.argsort(scores)[::-1][:top_k]

    results: list[dict[str, Any]] = []
    for idx in top_indices:
        # Copy so the score key does not mutate the loaded card dicts.
        card = dict(cards[idx])
        card["score"] = float(scores[idx])
        results.append(card)

    return results
+
+
def main() -> None:
    """CLI entry point: parse the query, run search(), print ranked hits."""
    parser = argparse.ArgumentParser(description="Query the local command card index.")
    parser.add_argument("query", type=str, help="Natural language query")
    parser.add_argument("--top-k", type=int, default=5, help="Number of results to return")
    args = parser.parse_args()

    results = search(args.query, top_k=args.top_k)

    for i, result in enumerate(results, start=1):
        print(f"[{i}] score={result['score']:.4f} id={result['id']}")
        print(f"    command: {result['command']}")
        print(f"    explanation: {result['explanation']}")
        if result["alternatives"]:
            print(f"    alternatives: {', '.join(result['alternatives'])}")
        print(f"    intent: {', '.join(result['intent'][:3])}")
        print()


if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..24c6e0b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+sentence-transformers>=3.0.0
+numpy>=1.26.0