From d2c48de60da57ab9736168aa608c506078c4b5d2 Mon Sep 17 00:00:00 2001 From: Gauvino Date: Mon, 1 Jun 2026 17:37:22 +0200 Subject: [PATCH] ci(issues): flag likely-duplicate issues on open Adds .github/workflows/detect-duplicate.yml + scripts/detect-duplicate-issue.mjs (Bun, dep-free, no API key): on a new issue, compares its title/body to open issues via Jaccard similarity (with light stemming and stop-words), and if the top matches pass a threshold, posts one comment listing them and adds a 'possible duplicate' label. Inspired by seerr's detect-duplicate, minus the embedding/Groq dependency. --- .github/workflows/detect-duplicate.yml | 38 +++++ scripts/detect-duplicate-issue.mjs | 192 +++++++++++++++++++++++++ 2 files changed, 230 insertions(+) create mode 100644 .github/workflows/detect-duplicate.yml create mode 100644 scripts/detect-duplicate-issue.mjs diff --git a/.github/workflows/detect-duplicate.yml b/.github/workflows/detect-duplicate.yml new file mode 100644 index 000000000..09aa23565 --- /dev/null +++ b/.github/workflows/detect-duplicate.yml @@ -0,0 +1,38 @@ +name: 🔍 Detect Duplicate Issues + +on: + issues: + types: [opened] + +permissions: + contents: read + +concurrency: + group: detect-duplicate-${{ github.event.issue.number }} + cancel-in-progress: true + +jobs: + detect: + name: 🔍 Find similar issues + if: github.actor != 'github-actions[bot]' + runs-on: ubuntu-24.04 + permissions: + issues: write + contents: read + steps: + - name: 📥 Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: 🍞 Setup Bun + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0 + with: + bun-version: latest + + - name: 🔍 Detect duplicate issues + run: bun scripts/detect-duplicate-issue.mjs + env: + GH_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY: ${{ github.repository }} + ISSUE_NUMBER: ${{ github.event.issue.number }} + ISSUE_TITLE: ${{ github.event.issue.title }} + ISSUE_BODY: ${{ 
github.event.issue.body }} diff --git a/scripts/detect-duplicate-issue.mjs b/scripts/detect-duplicate-issue.mjs new file mode 100644 index 000000000..c951bcb7b --- /dev/null +++ b/scripts/detect-duplicate-issue.mjs @@ -0,0 +1,192 @@ +#!/usr/bin/env bun +/** + * Flags likely-duplicate issues when a new issue is opened, using lexical similarity + * (Jaccard over word sets of the title and body) — no API key, no embeddings. + * + * On a match it posts ONE comment listing the closest open issues and adds the + * "possible duplicate" label. If nothing is similar enough, it does nothing. + * + * Env: + * GITHUB_REPOSITORY owner/repo + * ISSUE_NUMBER the new issue number + * ISSUE_TITLE the new issue title + * ISSUE_BODY the new issue body + * GH_TOKEN/GITHUB_TOKEN for gh (provided in CI) + * DUP_THRESHOLD similarity threshold 0..1 (default 0.3) + * DUP_MAX max matches to report (default 5) + * DUP_FIXTURE optional path to a JSON array of {number,title,body} (local testing) + * DRY_RUN if set, print results instead of commenting/labelling + */ + +import { execFileSync } from "node:child_process"; +import { readFileSync } from "node:fs"; + +const REPO = process.env.GITHUB_REPOSITORY || "streamyfin/streamyfin"; +const NUMBER = Number(process.env.ISSUE_NUMBER); +const TITLE = process.env.ISSUE_TITLE || ""; +const BODY = process.env.ISSUE_BODY || ""; +const THRESHOLD = Number(process.env.DUP_THRESHOLD) || 0.3; +const MAX = Number(process.env.DUP_MAX) || 5; +const DRY = !!process.env.DRY_RUN; +const LABEL = "possible duplicate"; + +// Generic stop words only — keep domain/feature/platform words (android, downloads, +// subtitles…) since those are exactly what makes two reports the same or different. 
// Words that `tokens` discards before comparison.
const STOP = new Set(
  (
    "a an the and or but if then of to in on at by for with from as is are was were be been being do does did " +
    "it its this that these those i you we they me my your our their he she him her " +
    "when while where what which who how why so just then than too very can could would should will " +
    "not no nor only own same s t don dont im ive please thanks hi hello also still get got use used using " +
    "app application streamyfin issue bug"
  ).split(/\s+/),
);

/**
 * Light stemming: strips a single trailing "ing", "ed", "es" or "s".
 * @param {string} w - lower-case word
 * @returns {string} the word without that suffix
 */
const stem = (w) => w.replace(/(ing|ed|es|s)$/, "");

/**
 * Turns free text into the list of comparison tokens.
 *
 * Lower-cases the text, blanks out fenced code blocks, HTML comments and URLs,
 * replaces every character outside [a-z0-9] and whitespace with a space, then
 * keeps words longer than two characters that are not stop words, stems them,
 * and drops stems that ended up two characters or shorter.
 *
 * @param {string | null | undefined} s - raw title or body; nullish is treated as ""
 * @returns {string[]} tokens (may contain repeats; `jaccard` de-duplicates)
 */
const tokens = (s) =>
  (s || "")
    .toLowerCase()
    .replace(/```[\s\S]*?```/g, " ") // drop code blocks
    // The pattern below had been reduced to an empty `//g`, which parses as a
    // line comment and leaves the `.replace(` call unclosed.
    .replace(/<!--[\s\S]*?-->/g, " ") // drop html comments
    .replace(/https?:\/\/\S+/g, " ") // drop urls
    .replace(/[^a-z0-9\s]/g, " ")
    .split(/\s+/)
    .filter((w) => w.length > 2 && !STOP.has(w))
    .map(stem)
    .filter((w) => w.length > 2);

/**
 * Jaccard similarity of two token lists, compared as sets.
 * @param {Iterable<string>} a
 * @param {Iterable<string>} b
 * @returns {number} |A ∩ B| / |A ∪ B|, or 0 when either set is empty
 */
const jaccard = (a, b) => {
  const A = new Set(a);
  const B = new Set(b);
  if (!A.size || !B.size) return 0;
  let inter = 0;
  for (const x of A) if (B.has(x)) inter++;
  return inter / (A.size + B.size - inter);
};

// The new issue is tokenised once; every candidate is tokenised inside `score`.
const newTitle = tokens(TITLE);
const newBody = tokens(BODY);

/**
 * Similarity between the new issue and candidate `o`: title similarity weighted
 * 0.6 plus body similarity weighted 0.4.
 * @param {{title: string, body: string | null}} o
 * @returns {number} value in 0..1
 */
const score = (o) =>
  0.6 * jaccard(newTitle, tokens(o.title)) +
  0.4 * jaccard(newBody, tokens(o.body));

// fetch open issues (excluding PRs and the new issue itself)
let issues;
if (process.env.DUP_FIXTURE) {
  issues = JSON.parse(readFileSync(process.env.DUP_FIXTURE, "utf8"));
} else {
  const raw = execFileSync(
    "gh",
    [
      "api",
      `repos/${REPO}/issues`,
      "--paginate",
      "-X",
      "GET",
      "-f",
      "state=open",
      "-f",
      "per_page=100",
      "--jq",
      ".[] | select(.pull_request | not) | {number, title, body}",
    ],
    { encoding: "utf8", maxBuffer: 1e8 },
  );
  // The jq filter emits one JSON object per line.
  issues = raw
    .split("\n")
    .filter(Boolean)
    .map((l) => JSON.parse(l));
}

// Score every other issue, keep those at or above the threshold, best first.
const matches = issues
  .filter((o) => o.number !== NUMBER)
  .map((o) => ({ ...o, s: score(o) }))
  .filter((o) => o.s >= THRESHOLD)
  .sort((a, b) => b.s - a.s)
  .slice(0, MAX);

if (!matches.length) {
  console.log("No likely duplicates found.");
  process.exit(0);
}

const list = matches
  .map(
    (m) =>
      `- #${m.number} — ${m.title} _(≈ ${Math.round(m.s * 100)}% similar)_`,
  )
  .join("\n");
// NOTE(review): the leading empty string may be what is left of a hidden
// HTML marker comment that was stripped from this copy — confirm against the
// original script before relying on it.
const comment = [
  "",
  "🔍 **This looks like it might be a duplicate.** Possibly related open issues:",
  "",
  list,
  "",
  "If yours is different, ignore this — a maintainer will confirm. Otherwise, please 👍 the existing issue and add any extra details there.",
].join("\n");

console.log(`Found ${matches.length} possible duplicate(s):\n${list}`);

if (DRY) {
  console.log("\nDRY_RUN: not commenting/labelling.");
  process.exit(0);
}

/**
 * POSTs raw string fields to a GitHub API path through the `gh` CLI.
 * Throws when `gh` exits non-zero; its output is discarded.
 * @param {string} path - API path, e.g. `repos/o/r/labels`
 * @param {string[]} fields - `key=value` pairs, each passed with `-f`
 */
const ghPost = (path, fields) =>
  execFileSync(
    "gh",
    ["api", "-X", "POST", path, ...fields.flatMap((f) => ["-f", f])],
    { stdio: "ignore" },
  );

/** Attaches LABEL to the new issue. */
const addLabel = () =>
  ghPost(`repos/${REPO}/issues/${NUMBER}/labels`, [`labels[]=${LABEL}`]);

ghPost(`repos/${REPO}/issues/${NUMBER}/comments`, [`body=${comment}`]);
try {
  addLabel();
} catch {
  // label may not exist yet — create then add.
  // Deliberate best-effort: any failure of the first attempt lands here, and a
  // failure of the create or the retry propagates and fails the run.
  ghPost(`repos/${REPO}/labels`, [
    `name=${LABEL}`,
    "color=fbca04",
    "description=Automatically flagged as a possible duplicate",
  ]);
  addLabel();
}
console.log("Commented and labelled.");