From 2f37974a4c84f7ffdd07e2c223eba2d8bd981b61 Mon Sep 17 00:00:00 2001 From: twells46 <173561638+twells46@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:20:50 -0500 Subject: Initial commit --- .gitignore | 2 + README.md | 28 ++++++++ build_index.py | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ cache_model.py | 34 ++++++++++ cards.jsonl | 31 +++++++++ cheat.db | Bin 0 -> 118784 bytes init_db.py | 65 +++++++++++++++++++ inspect_db.py | 34 ++++++++++ query_index.py | 148 ++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + 10 files changed, 536 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 build_index.py create mode 100644 cache_model.py create mode 100644 cards.jsonl create mode 100644 cheat.db create mode 100644 init_db.py create mode 100644 inspect_db.py create mode 100644 query_index.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b7a10c --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv +models \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..534001e --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# cheat + +Local command-helper retrieval system using JSONL, SQLite, and sentence-transformers. 
+ +## Setup + +```sh +export HF_HOME="$PWD/models/hf" +export SENTENCE_TRANSFORMERS_HOME="$PWD/models/hf" +python -m venv venv +source .venv/bin/activate +pip install -U pip +pip install -r requirements.txt +python scripts/init_db.py +python scripts/build_index.py +``` + +Then run a query like this: + +```sh +python scripts/query_index.py "get free disk space" +``` + +To add commands, add to `./cards.jsonl` and rebuild the index: + +```sh +python scripts/build_index.py +``` diff --git a/build_index.py b/build_index.py new file mode 100644 index 0000000..8597c68 --- /dev/null +++ b/build_index.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path +from typing import Any + +import numpy as np +from sentence_transformers import SentenceTransformer + +import os +from pathlib import Path + +DB_PATH = Path("cheat.db") +CARDS_PATH = Path("./cards.jsonl") +MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" +LOCAL_CACHE_DIR = Path("models/hf") + +LOCAL_CACHE_DIR = Path("models/hf") +os.environ.setdefault("HF_HOME", str(LOCAL_CACHE_DIR.resolve())) +os.environ.setdefault("SENTENCE_TRANSFORMERS_HOME", str(LOCAL_CACHE_DIR.resolve())) + +REQUIRED_FIELDS = { + "id", + "intent", + "command", + "alternatives", + "explanation", + "requires", + "packages", + "tags", + "platform", + "shell", + "safety", +} + + +def load_cards(path: Path) -> list[dict[str, Any]]: + cards: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as f: + for line_no, line in enumerate(f, start=1): + line = line.strip() + if not line: + continue + try: + card = json.loads(line) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON on line {line_no}: {e}") from e + + missing = REQUIRED_FIELDS - set(card.keys()) + if missing: + raise ValueError(f"Missing required fields on line {line_no}: {sorted(missing)}") + + cards.append(card) + return cards + + +def build_search_text(card: dict[str, Any]) -> 
def serialize_embedding(vec: np.ndarray) -> bytes:
    """Return the raw bytes of *vec* after converting it to ``float32``."""
    as_float32 = vec.astype(np.float32)
    return as_float32.tobytes()
def main() -> None:
    """Rebuild the embedding index in ``DB_PATH`` from ``CARDS_PATH``.

    Loads and validates the cards, embeds each card's search text with
    ``MODEL_NAME`` (loaded from the local cache only), upserts every card and
    its embedding, and finally removes rows whose id no longer appears in the
    cards file so deleted cards stop showing up in search results.

    All writes happen in one transaction: either the whole rebuild is
    committed or, if anything raises, none of it is.

    Raises:
        FileNotFoundError: If the database or the cards file does not exist.
        ValueError: Propagated from ``load_cards`` for malformed card data.
    """
    if not DB_PATH.exists():
        raise FileNotFoundError(
            f"Database not found at {DB_PATH}. Run scripts/init_db.py first."
        )
    if not CARDS_PATH.exists():
        raise FileNotFoundError(f"Cards file not found at {CARDS_PATH}")

    cards = load_cards(CARDS_PATH)
    if not cards:
        # An empty file is far more likely a mistake than a request to wipe
        # the index, so skip the model load and leave the database as it is.
        print(f"No cards found in {CARDS_PATH}; {DB_PATH} left unchanged")
        return

    model = SentenceTransformer(
        MODEL_NAME,
        cache_folder=str(LOCAL_CACHE_DIR.resolve()),
        local_files_only=True,
    )

    search_texts = [build_search_text(card) for card in cards]
    embeddings = model.encode(
        search_texts,
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

    conn = sqlite3.connect(DB_PATH)
    try:
        # Must run before the first write: the pragma is ignored once a
        # transaction is open.
        conn.execute("PRAGMA foreign_keys=ON;")

        # The connection context manager commits on success and rolls back
        # if the body raises; it does not close the connection.
        with conn:
            for card, vec, search_text in zip(cards, embeddings, search_texts):
                upsert_card(conn, card, search_text)
                upsert_embedding(conn, card["id"], MODEL_NAME, vec)

            current_ids = {card["id"] for card in cards}
            stale = [
                (row[0],)
                for row in conn.execute("SELECT id FROM cards").fetchall()
                if row[0] not in current_ids
            ]
            # Delete embeddings explicitly rather than relying solely on the
            # schema's cascade behaviour.
            conn.executemany("DELETE FROM card_embeddings WHERE card_id = ?", stale)
            conn.executemany("DELETE FROM cards WHERE id = ?", stale)

        print(f"Indexed {len(cards)} cards into {DB_PATH}")
        if stale:
            print(f"Removed {len(stale)} stale cards no longer in {CARDS_PATH}")
    finally:
        conn.close()
def main() -> None:
    """Download the embedding model into the local cache directory.

    Creates ``LOCAL_CACHE_DIR`` if needed, points the ``HF_HOME`` and
    ``SENTENCE_TRANSFORMERS_HOME`` environment variables at it unless they
    are already set, loads ``MODEL_NAME`` with that directory as its cache
    folder, and runs one small encode call.
    """
    LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_dir = str(LOCAL_CACHE_DIR.resolve())

    # setdefault leaves any value the user already exported untouched.
    for env_var in ("HF_HOME", "SENTENCE_TRANSFORMERS_HOME"):
        os.environ.setdefault(env_var, cache_dir)

    print(f"Caching model: {MODEL_NAME}")
    print(f"Cache dir: {cache_dir}")

    encoder = SentenceTransformer(MODEL_NAME, cache_folder=cache_dir)

    # Run a real encode call so all needed files are loaded.
    encoder.encode(["test"], convert_to_numpy=True)

    print("Model cached successfully.")
+{"id":"lsblk_block_devices","intent":["list disks and partitions","show block devices","see drives and partitions","inspect storage devices"],"command":"lsblk","alternatives":["lsblk -f"],"explanation":"Lists block devices such as disks, partitions, and mount points.","requires":["lsblk"],"packages":{},"tags":["disk","storage","devices"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"df_h_disk_usage","intent":["show disk free space","check filesystem usage","see disk usage by filesystem","how full are my disks"],"command":"df -h","alternatives":[],"explanation":"Shows filesystem size, used space, and available space in human-readable units.","requires":["df"],"packages":{},"tags":["disk","filesystem","space"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"du_sh_directory_size","intent":["show size of current directory","check folder size","how large is this directory","measure directory disk usage"],"command":"du -sh .","alternatives":["du -sh *"],"explanation":"Shows total size of the current directory.","requires":["du"],"packages":{},"tags":["disk","directory","size"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"find_large_files","intent":["find large files","show biggest files under current directory","locate files larger than 500MB","search for huge files"],"command":"find . -type f -size +500M","alternatives":["find /path -type f -size +1G"],"explanation":"Finds files larger than a given size.","requires":["find"],"packages":{},"tags":["find","files","disk","cleanup"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"find_name_case_insensitive","intent":["find a file by name ignoring case","search for filename case insensitive","locate file by partial name","find matching filename"],"command":"find . 
-iname '*pattern*'","alternatives":[],"explanation":"Searches recursively for files whose names match a case-insensitive pattern.","requires":["find"],"packages":{},"tags":["find","files","search"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"rg_text_recursive","intent":["search text recursively in files","grep through a project","find text in source tree","search contents of files fast"],"command":"rg 'pattern'","alternatives":["grep -R \"pattern\" ."],"explanation":"Searches recursively for text, usually faster and cleaner than grep -R.","requires":["rg"],"packages":{"fedora":["ripgrep"],"debian":["ripgrep"]},"tags":["search","text","grep","files","code"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ss_listening_ports","intent":["show open listening ports","list listening sockets","see what ports are open locally","check listening network services"],"command":"ss -ltnp","alternatives":["sudo ss -ltnp"],"explanation":"Shows listening TCP sockets and associated processes when permitted.","requires":["ss"],"packages":{},"tags":["network","ports","sockets","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ss_process_on_port","intent":["find process using port 8080","see what is listening on a port","which program owns port 3000","identify service bound to port"],"command":"ss -ltnp | grep ':8080'","alternatives":["sudo ss -ltnp | grep ':8080'"],"explanation":"Filters listening sockets to the requested port.","requires":["ss","grep"],"packages":{},"tags":["network","ports","process"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ip_addr_show","intent":["show IP addresses","list network interfaces and addresses","see local IPs","inspect interface addresses"],"command":"ip addr","alternatives":["ip -brief addr"],"explanation":"Shows network interfaces and assigned IP 
addresses.","requires":["ip"],"packages":{},"tags":["network","ip","interfaces"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ip_route_show","intent":["show routing table","see default route","inspect routes","what route is this host using"],"command":"ip route","alternatives":[],"explanation":"Displays the kernel routing table, including the default route.","requires":["ip"],"packages":{},"tags":["network","routing"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ping_basic","intent":["test network reachability","ping a host","check if a machine is reachable","see if DNS and network work"],"command":"ping -c 4 example.com","alternatives":[],"explanation":"Sends a few ICMP echo requests to test reachability and latency.","requires":["ping"],"packages":{},"tags":["network","diagnostics","latency"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"curl_headers","intent":["show HTTP headers","inspect response headers from a URL","check server response headers","see HTTP status and headers"],"command":"curl -I https://example.com","alternatives":["curl -sSI https://example.com"],"explanation":"Fetches only the response headers from a URL.","requires":["curl"],"packages":{},"tags":["http","curl","headers","web"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"curl_download_file","intent":["download a file from a URL","save URL to local file","fetch a file with curl","download file without browser"],"command":"curl -LO https://example.com/file.tar.gz","alternatives":["wget https://example.com/file.tar.gz"],"explanation":"Downloads a file and saves it with the remote name.","requires":["curl"],"packages":{},"tags":["http","download","curl"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"journalctl_boot_errors","intent":["show boot errors","inspect errors from current boot","check systemd 
errors since boot","look at boot logs"],"command":"journalctl -b -p err","alternatives":["journalctl -b"],"explanation":"Shows error-priority messages from the current boot.","requires":["journalctl"],"packages":{},"tags":["logs","systemd","boot","errors"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"journalctl_service_follow","intent":["follow logs for a systemd service","tail service logs","watch logs from sshd","stream unit logs"],"command":"journalctl -u sshd -f","alternatives":[],"explanation":"Follows logs for a specific systemd unit in real time.","requires":["journalctl"],"packages":{},"tags":["logs","systemd","service"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"ps_grep_process","intent":["find a running process by name","check if process is running","search process list","locate process from command name"],"command":"ps aux | grep '[n]ame'","alternatives":["pgrep -a name"],"explanation":"Searches the process list while avoiding matching the grep process itself.","requires":["ps","grep"],"packages":{},"tags":["process","ps","grep"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"pgrep_full_cmdline","intent":["find process by name with pid","show pid for matching process","lookup running process and command line","find process quickly"],"command":"pgrep -a name","alternatives":[],"explanation":"Prints matching PIDs along with their command lines.","requires":["pgrep"],"packages":{},"tags":["process","pid","lookup"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"free_h_memory","intent":["show memory usage","check RAM usage","how much memory is free","inspect memory consumption summary"],"command":"free -h","alternatives":[],"explanation":"Shows total, used, and available memory in human-readable 
units.","requires":["free"],"packages":{},"tags":["memory","ram","diagnostics"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"top_overview","intent":["show top processes","inspect CPU usage live","watch system activity","interactive process viewer"],"command":"top","alternatives":["htop"],"explanation":"Shows a live interactive overview of processes and resource usage.","requires":["top"],"packages":{"fedora":["htop"],"debian":["htop"]},"tags":["cpu","memory","process","monitoring"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"chmod_executable","intent":["make a script executable","add execute permission to file","chmod script so it can run directly","set executable bit"],"command":"chmod +x script.sh","alternatives":[],"explanation":"Adds execute permission so the file can be run directly.","requires":["chmod"],"packages":{},"tags":["permissions","chmod","scripts"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"} +{"id":"tar_extract_gz","intent":["extract a tar.gz archive","unpack tgz file","untar compressed archive","open tarball"],"command":"tar -xvf archive.tar.gz","alternatives":["tar -xzf archive.tar.gz"],"explanation":"Extracts files from a tar archive; some users prefer -z explicitly for gzip.","requires":["tar"],"packages":{},"tags":["archive","tar","extract"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"} +{"id":"tar_create_gz","intent":["create a tar.gz archive","compress a directory into tar.gz","make a tarball","archive files into gzip tar"],"command":"tar -cvzf archive.tar.gz dir/","alternatives":[],"explanation":"Creates a gzip-compressed tar archive from a directory.","requires":["tar"],"packages":{},"tags":["archive","tar","compress"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-modifying"} +{"id":"systemctl_status_service","intent":["check status of a service","see if systemd service is 
running","inspect service health","show systemctl status"],"command":"systemctl status sshd","alternatives":["systemctl --no-pager status sshd"],"explanation":"Shows service state, recent logs, and unit metadata.","requires":["systemctl"],"packages":{},"tags":["systemd","service","status"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"systemctl_restart_service","intent":["restart a service","bounce systemd unit","restart sshd","reload service by restarting it"],"command":"sudo systemctl restart sshd","alternatives":["sudo systemctl try-restart sshd"],"explanation":"Restarts a systemd service; usually requires root privileges.","requires":["systemctl","sudo"],"packages":{},"tags":["systemd","service","restart"],"platform":["linux"],"shell":["bash","zsh","fish"],"safety":"privileged-modifying"} +{"id":"dnf_which_package_owns_file","intent":["find which package owns a file","what rpm provides this path","determine package for installed file","lookup file owner package"],"command":"rpm -qf /path/to/file","alternatives":["dnf provides /path/to/file"],"explanation":"Shows which installed package owns a given file; dnf provides can search repos too.","requires":["rpm"],"packages":{},"tags":["package","rpm","dnf","files"],"platform":["linux","fedora"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"dnf_search_package","intent":["search for a package in dnf","find package by name or description","lookup package in repositories","search repo packages"],"command":"dnf search keyword","alternatives":[],"explanation":"Searches package metadata in enabled repositories.","requires":["dnf"],"packages":{},"tags":["package","dnf","search"],"platform":["linux","fedora"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"git_status_short","intent":["show git status briefly","check repo state","see changed files in git","short git status"],"command":"git status --short","alternatives":["git 
status"],"explanation":"Shows tracked and untracked file changes in a compact format.","requires":["git"],"packages":{},"tags":["git","status","repo"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"git_log_oneline_graph","intent":["show concise git history","see commit graph briefly","inspect recent commits compactly","git log one line graph"],"command":"git log --oneline --graph --decorate -n 20","alternatives":[],"explanation":"Shows a compact decorated commit graph for recent history.","requires":["git"],"packages":{},"tags":["git","history","commits"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} +{"id":"git_show_changed_files","intent":["show files changed in last commit","list files changed by commit","see modified files from HEAD","inspect changed paths in commit"],"command":"git show --name-only --oneline HEAD","alternatives":["git diff --name-only HEAD~1 HEAD"],"explanation":"Shows the last commit summary and the files it changed.","requires":["git"],"packages":{},"tags":["git","diff","files","commits"],"platform":["linux","macos"],"shell":["bash","zsh","fish"],"safety":"safe-readonly"} diff --git a/cheat.db b/cheat.db new file mode 100644 index 0000000..ce146cf Binary files /dev/null and b/cheat.db differ diff --git a/init_db.py b/init_db.py new file mode 100644 index 0000000..a401a91 --- /dev/null +++ b/init_db.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import sqlite3 +from pathlib import Path + +DB_PATH = Path("cheat.db") + + +def main() -> None: + DB_PATH.parent.mkdir(parents=True, exist_ok=True) + + conn = sqlite3.connect(DB_PATH) + try: + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA foreign_keys=ON;") + + conn.execute(""" + CREATE TABLE IF NOT EXISTS cards ( + id TEXT PRIMARY KEY, + command TEXT NOT NULL, + explanation TEXT NOT NULL, + intent_json TEXT NOT NULL, + alternatives_json TEXT NOT NULL, + 
requires_json TEXT NOT NULL, + packages_json TEXT NOT NULL, + tags_json TEXT NOT NULL, + platform_json TEXT NOT NULL, + shell_json TEXT NOT NULL, + safety TEXT NOT NULL, + search_text TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP + ); + """) + + conn.execute(""" + CREATE TABLE IF NOT EXISTS card_embeddings ( + card_id TEXT PRIMARY KEY, + model_name TEXT NOT NULL, + embedding_blob BLOB NOT NULL, + embedding_dim INTEGER NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(card_id) REFERENCES cards(id) ON DELETE CASCADE + ); + """) + + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_cards_command + ON cards(command); + """) + + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_cards_safety + ON cards(safety); + """) + + conn.commit() + print(f"Initialized database at {DB_PATH}") + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/inspect_db.py b/inspect_db.py new file mode 100644 index 0000000..0419f26 --- /dev/null +++ b/inspect_db.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import sqlite3 +from pathlib import Path + +DB_PATH = Path("storage/cmdhelp.db") + + +def main() -> None: + conn = sqlite3.connect(DB_PATH) + try: + card_count = conn.execute("SELECT COUNT(*) FROM cards").fetchone()[0] + emb_count = conn.execute("SELECT COUNT(*) FROM card_embeddings").fetchone()[0] + + print(f"cards: {card_count}") + print(f"embeddings: {emb_count}") + print() + + rows = conn.execute(""" + SELECT id, command, safety + FROM cards + ORDER BY id + LIMIT 10 + """).fetchall() + + for row in rows: + print(row) + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/query_index.py b/query_index.py new file mode 100644 index 0000000..7dee4d3 --- /dev/null +++ b/query_index.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import sqlite3 +from pathlib import 
def deserialize_embedding(blob: bytes, dim: int) -> np.ndarray:
    """Decode *blob* as a ``float32`` vector and check its length.

    Args:
        blob: Raw bytes to interpret as ``float32`` values.
        dim: Number of elements the decoded vector must have.

    Returns:
        The decoded vector.

    Raises:
        ValueError: If the decoded vector does not have exactly *dim* elements.
    """
    decoded = np.frombuffer(blob, dtype=np.float32)
    actual = decoded.shape[0]
    if actual == dim:
        return decoded
    raise ValueError(f"Embedding length mismatch: expected {dim}, got {actual}")
def search(
    query: str,
    top_k: int = 5,
) -> list[dict[str, Any]]:
    """Return the indexed cards that best match a natural-language query.

    The query is embedded with the same model name that is stored alongside
    the index, and each card is scored by the dot product between its stored
    embedding and the query embedding.

    Args:
        query: Free-text description of what the user wants to do.
        top_k: Maximum number of results. Values of zero or less yield an
            empty list (a negative value would otherwise slice
            "all but the last N" results).

    Returns:
        Up to *top_k* card dicts, best match first, each with an added
        ``"score"`` float.

    Raises:
        FileNotFoundError: If the database file does not exist. Checked up
            front because ``sqlite3.connect`` would otherwise create an
            empty database file as a side effect.
        RuntimeError: Propagated from ``load_index`` when the index is empty
            or mixes embedding models.
    """
    if top_k <= 0:
        return []

    if not DB_PATH.exists():
        raise FileNotFoundError(
            f"Database not found at {DB_PATH}. Run build_index.py first."
        )

    conn = sqlite3.connect(DB_PATH)
    try:
        cards, matrix, model_name = load_index(conn)
    finally:
        conn.close()

    model = SentenceTransformer(
        model_name,
        cache_folder=str(LOCAL_CACHE_DIR.resolve()),
        local_files_only=True,
    )

    qvec = model.encode([query], normalize_embeddings=True, convert_to_numpy=True)[0]

    # One score per card (row of the matrix); highest score first.
    scores = matrix @ qvec
    top_indices = np.argsort(scores)[::-1][:top_k]

    # Copy each card so callers can mutate results without touching the
    # loaded index.
    return [{**cards[idx], "score": float(scores[idx])} for idx in top_indices]
parser.parse_args() + + results = search(args.query, top_k=args.top_k) + + for i, result in enumerate(results, start=1): + print(f"[{i}] score={result['score']:.4f} id={result['id']}") + print(f" command: {result['command']}") + print(f" explanation: {result['explanation']}") + if result["alternatives"]: + print(f" alternatives: {', '.join(result['alternatives'])}") + print(f" intent: {', '.join(result['intent'][:3])}") + print() + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..24c6e0b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +sentence-transformers>=3.0.0 +numpy>=1.26.0 -- cgit v1.2.3