feat(vision): enhance quad detection with Hough lines and multi-strategy preprocessing

- Add Hough line detection for improved edge finding under finger occlusion
- Implement multi-strategy preprocessing (standard, enhanced, adaptive, multi)
- Add configurable parameters for Canny thresholds, adaptive threshold, morph gradient
- Refactor useDocumentDetection hook with cleaner API
- Add OpenCV type definitions and async loading improvements
- Add loader test pages for debugging OpenCV initialization
- Add quad-test page for interactive detection testing
- Add document detection research notes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Thomas Hallock 2026-01-12 11:16:06 -06:00
parent bc02ba281d
commit 93a25c1e7b
29 changed files with 3091 additions and 637 deletions

View File

@ -641,7 +641,16 @@
"Bash(if [ -f /Users/antialias/projects/soroban-abacus-flashcards/apps/web/data/vision-training/collected/.deleted ])",
"Bash(then wc -l /Users/antialias/projects/soroban-abacus-flashcards/apps/web/data/vision-training/collected/.deleted)",
"Bash(else echo \"File does not exist\")",
"Bash(fi)"
"Bash(fi)",
"WebFetch(domain:docs.opencv.org)",
"mcp__chrome-devtools__new_page",
"mcp__chrome-devtools__close_page",
"WebFetch(domain:www.npmjs.com)",
"Bash(git branch:*)",
"WebFetch(domain:scanbot.io)",
"WebFetch(domain:learnopencv.com)",
"WebFetch(domain:news.ycombinator.com)",
"Bash(npm run typecheck:*)"
],
"deny": [],
"ask": []

View File

@ -339,8 +339,8 @@ export function NavSyncIndicator({ sync }: NavSyncIndicatorProps) {
</div>
{sync.status?.local && sync.status?.remote && (
<div className={css({ color: 'gray.400', fontSize: 'xs' })}>
Local: {sync.status.local.totalImages?.toLocaleString() || 0} {' '}
Remote: {sync.status.remote.totalImages?.toLocaleString() || 0}
Local: {sync.status.local.totalImages?.toLocaleString() || 0} Remote:{' '}
{sync.status.remote.totalImages?.toLocaleString() || 0}
</div>
)}
</div>

View File

@ -118,7 +118,9 @@ export function useSyncStatus(modelType: ModelType): UseSyncStatusResult {
const refreshHistory = useCallback(async () => {
setHistoryLoading(true)
try {
const response = await fetch(`/api/vision-training/sync/history?modelType=${modelType}&limit=5`)
const response = await fetch(
`/api/vision-training/sync/history?modelType=${modelType}&limit=5`
)
if (response.ok) {
const data = await response.json()
setHistory(data.history || [])

View File

@ -77,7 +77,8 @@ export default function LoaderTestAsyncPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -80,7 +80,8 @@ export default function LoaderTestBarePage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -32,9 +32,7 @@ export default function LoaderTestCheckPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Check window.cv (No Loading)
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Check window.cv (No Loading)</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Just checks if window.cv exists - no loading.
</p>

View File

@ -47,9 +47,7 @@ export default function LoaderTestDirectPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Direct Import Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Direct Import Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Imports directly from loader.ts (not barrel index.ts).
</p>
@ -80,7 +78,8 @@ export default function LoaderTestDirectPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -49,9 +49,7 @@ export default function LoaderTestHookCustomPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Custom Hook Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Custom Hook Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Uses custom useOpenCV hook from separate file.
</p>
@ -82,7 +80,8 @@ export default function LoaderTestHookCustomPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -82,7 +82,8 @@ export default function LoaderTestHookPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -132,9 +132,7 @@ export default function LoaderTestInlinePage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Inline Loader Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Inline Loader Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Loader code is INLINE in this component (not imported from module).
</p>
@ -165,7 +163,8 @@ export default function LoaderTestInlinePage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -41,9 +41,7 @@ export default function LoaderTestScriptPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Script Tag Test (No Waiting)
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Script Tag Test (No Waiting)</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Step 1: Add script tag. Step 2: Check if cv loaded.
</p>

View File

@ -79,7 +79,8 @@ export default function LoaderTestSimplePage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -43,9 +43,7 @@ export default function LoaderTestV2Page() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Loader V2 Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Loader V2 Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Uses new loaderV2.ts with proven working pattern.
</p>
@ -76,7 +74,8 @@ export default function LoaderTestV2Page() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -74,7 +74,8 @@ export default function LoaderTestV3Page() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -74,7 +74,8 @@ export default function LoaderTestV4Page() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -44,9 +44,7 @@ export default function LoaderTestV5Page() {
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Loader V5 Test (No Internal Await)
</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Returns Promise, consumer awaits it.
</p>
<p className={css({ color: 'gray.400', mb: 4 })}>Returns Promise, consumer awaits it.</p>
<button
type="button"
@ -74,7 +72,8 @@ export default function LoaderTestV5Page() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -46,9 +46,7 @@ export default function LoaderTestWaitPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Wait Test (Imported Promise)
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Wait Test (Imported Promise)</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Adds script tag then waits with imported Promise function.
</p>
@ -79,7 +77,8 @@ export default function LoaderTestWaitPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -51,9 +51,7 @@ export default function LoaderTestWrappedPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Wrapped Import Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Wrapped Import Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
Imported loadOpenCV wrapped in useCallback.
</p>
@ -84,7 +82,8 @@ export default function LoaderTestWrappedPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

View File

@ -47,9 +47,7 @@ export default function LoaderTestPage() {
gap: 4,
})}
>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>
Minimal OpenCV Loader Test
</h1>
<h1 className={css({ fontSize: '2xl', fontWeight: 'bold' })}>Minimal OpenCV Loader Test</h1>
<p className={css({ color: 'gray.400', mb: 4 })}>
This page ONLY imports the standalone loader. No useDocumentDetection.
</p>
@ -80,7 +78,8 @@ export default function LoaderTestPage() {
Status:{' '}
<span
className={css({
color: status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
color:
status === 'success' ? 'green.400' : status === 'error' ? 'red.400' : 'gray.400',
})}
>
{status}

File diff suppressed because it is too large

View File

@ -534,7 +534,9 @@ export function UnifiedDataPanel({ modelType, onDataChanged }: UnifiedDataPanelP
const handleStartSync = useCallback(async () => {
setSyncProgress({ phase: 'connecting', message: 'Connecting...' })
try {
const response = await fetch(`/api/vision-training/sync?modelType=${modelType}`, { method: 'POST' })
const response = await fetch(`/api/vision-training/sync?modelType=${modelType}`, {
method: 'POST',
})
if (response.ok) {
setSyncProgress({ phase: 'complete', message: 'Sync complete!' })
loadItems()

View File

@ -1,129 +1,33 @@
'use client'
import { useCallback, useRef, useState } from 'react'
import { useCallback, useEffect, useRef, useState } from 'react'
import {
createQuadDetector,
type DetectedQuad as ModularDetectedQuad,
type QuadDetectorConfig,
} from '@/lib/vision/quadDetector'
import { createQuadTracker, type TrackedQuad as ModularTrackedQuad } from '@/lib/vision/quadTracker'
import type { CV, CVMat } from '@/lib/vision/opencv/types'
// Re-export config type for consumers
export type { QuadDetectorConfig } from '@/lib/vision/quadDetector'
/**
* Hook for document detection using OpenCV.js directly
*
* Features:
* - Lazy loads OpenCV.js (~8MB) only when first used
* - Multi-quad tracking: detects ALL quadrilaterals, not just the largest
* - Uses modular quadDetector and quadTracker from @/lib/vision
* - Scores quads by: size, aspect ratio, and temporal stability
* - Filters out small quads (likely printed on page) vs page-sized quads
* - Provides highlightDocument for drawing detected quad on overlay
* - Provides extractDocument for cropping/deskewing captured image
*/
// OpenCV.js types (minimal interface for what we use)
interface CVMat {
delete: () => void
data32S: Int32Array
rows: number
cols: number
}
interface CVMatVector {
size: () => number
get: (i: number) => CVMat
delete: () => void
}
interface CVSize {
width: number
height: number
}
interface CVPoint {
x: number
y: number
}
interface CV {
Mat: new () => CVMat
MatVector: new () => CVMatVector
Size: new (w: number, h: number) => CVSize
Scalar: new (r?: number, g?: number, b?: number, a?: number) => unknown
imread: (canvas: HTMLCanvasElement) => CVMat
imshow: (canvas: HTMLCanvasElement, mat: CVMat) => void
cvtColor: (src: CVMat, dst: CVMat, code: number) => void
GaussianBlur: (
src: CVMat,
dst: CVMat,
size: CVSize,
sigmaX: number,
sigmaY: number,
borderType: number
) => void
Canny: (src: CVMat, dst: CVMat, t1: number, t2: number) => void
dilate: (src: CVMat, dst: CVMat, kernel: CVMat, anchor: CVPoint, iterations: number) => void
findContours: (
src: CVMat,
contours: CVMatVector,
hierarchy: CVMat,
mode: number,
method: number
) => void
contourArea: (contour: CVMat) => number
arcLength: (contour: CVMat, closed: boolean) => number
approxPolyDP: (contour: CVMat, approx: CVMat, epsilon: number, closed: boolean) => void
getPerspectiveTransform: (src: CVMat, dst: CVMat) => CVMat
warpPerspective: (
src: CVMat,
dst: CVMat,
M: CVMat,
size: CVSize,
flags: number,
borderMode: number,
borderValue: unknown
) => void
warpAffine: (
src: CVMat,
dst: CVMat,
M: CVMat,
size: CVSize,
flags?: number,
borderMode?: number,
borderValue?: unknown
) => void
getRotationMatrix2D: (center: CVPoint, angle: number, scale: number) => CVMat
rotate: (src: CVMat, dst: CVMat, rotateCode: number) => void
countNonZero: (src: CVMat) => number
matFromArray: (rows: number, cols: number, type: number, data: number[]) => CVMat
COLOR_RGBA2GRAY: number
BORDER_DEFAULT: number
RETR_LIST: number
CHAIN_APPROX_SIMPLE: number
CV_32FC2: number
INTER_LINEAR: number
BORDER_CONSTANT: number
ROTATE_90_CLOCKWISE: number
ROTATE_180: number
ROTATE_90_COUNTERCLOCKWISE: number
}
/** Represents a detected quadrilateral with corner points */
interface DetectedQuad {
corners: Array<{ x: number; y: number }>
area: number
aspectRatio: number
// Unique ID based on approximate center position
centerId: string
}
/** Tracked quad candidate with history */
interface TrackedQuad {
id: string
corners: Array<{ x: number; y: number }>
area: number
aspectRatio: number
/** How many frames this quad has been seen */
frameCount: number
/** Last frame number when this quad was seen */
lastSeenFrame: number
/** Stability score based on corner consistency */
stabilityScore: number
/** History of corner positions for stability calculation */
cornerHistory: Array<Array<{ x: number; y: number }>>
/** Internal tracked quad type for backward compatibility */
interface TrackedQuad extends ModularTrackedQuad {
/** History of corner positions for stability calculation (used by extractDocument) */
cornerHistory?: Array<Array<{ x: number; y: number }>>
}
export interface DocumentDetectionDebugInfo {
@ -143,28 +47,12 @@ export interface DocumentDetectionDebugInfo {
lastDetectionError: string | null
}
/** Number of frames to track quad history */
const HISTORY_LENGTH = 10
/** Minimum frames a quad must be seen to be considered stable */
const MIN_FRAMES_FOR_STABLE = 3
/** Minimum frames for "locked" state */
const LOCKED_FRAME_COUNT = 5
/** Maximum distance (as % of frame diagonal) for quads to be considered "same" */
const QUAD_MATCH_THRESHOLD = 0.08
/** Minimum area as % of frame for a quad to be considered page-sized */
const MIN_AREA_RATIO = 0.15
/** Maximum area as % of frame (filter out frame edges detected as quad) */
const MAX_AREA_RATIO = 0.95
/** Expected aspect ratios for documents (width/height) */
const EXPECTED_ASPECT_RATIOS = [
8.5 / 11, // US Letter portrait
11 / 8.5, // US Letter landscape
1 / Math.sqrt(2), // A4 portrait
Math.sqrt(2), // A4 landscape
1, // Square
]
/** How close aspect ratio must be to expected (tolerance) */
const ASPECT_RATIO_TOLERANCE = 0.3
/** Minimum stability score for locked state */
const MIN_STABILITY_FOR_LOCKED = 0.5
export interface DetectQuadsInImageResult {
/** Whether a document quad was detected */
@ -226,18 +114,44 @@ export interface UseDocumentDetectionReturn {
* Returns the canvas, or null if loading failed
*/
loadImageToCanvas: (file: File) => Promise<HTMLCanvasElement | null>
/**
* Reset all tracking state (call when returning from adjustment mode)
*/
resetTracking: () => void
/**
* Update detector configuration (recreates detector with new settings)
*/
updateDetectorConfig: (config: Partial<QuadDetectorConfig>) => void
/**
* Current detector configuration
*/
detectorConfig: Partial<QuadDetectorConfig>
}
export function useDocumentDetection(): UseDocumentDetectionReturn {
export interface UseDocumentDetectionOptions {
/** Initial detector configuration */
detectorConfig?: Partial<QuadDetectorConfig>
}
export function useDocumentDetection(
options?: UseDocumentDetectionOptions
): UseDocumentDetectionReturn {
// Start with isLoading=false since we won't load until requested
const [isLoading, setIsLoading] = useState(false)
const [error, setError] = useState<string | null>(null)
const cvRef = useRef<CV | null>(null)
const loadPromiseRef = useRef<Promise<void> | null>(null)
// Multi-quad tracking
const trackedQuadsRef = useRef<Map<string, TrackedQuad>>(new Map())
const frameCountRef = useRef(0)
// Detector configuration (can be updated dynamically)
const [detectorConfig, setDetectorConfig] = useState<Partial<QuadDetectorConfig>>(
options?.detectorConfig ?? {}
)
// Modular detector and tracker (created after OpenCV loads)
const detectorRef = useRef<ReturnType<typeof createQuadDetector> | null>(null)
const trackerRef = useRef<ReturnType<typeof createQuadTracker> | null>(null)
// Best quad tracking
const bestQuadRef = useRef<TrackedQuad | null>(null)
const lastStableFrameRef = useRef<HTMLCanvasElement | null>(null)
@ -348,6 +262,15 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
// Store OpenCV reference
cvRef.current = (window as unknown as { cv: CV }).cv
// Create modular detector and tracker with current config
detectorRef.current = createQuadDetector(cvRef.current, detectorConfig)
trackerRef.current = createQuadTracker({
minFramesForStable: MIN_FRAMES_FOR_STABLE,
minFramesForLocked: LOCKED_FRAME_COUNT,
minStabilityForLocked: MIN_STABILITY_FOR_LOCKED,
})
const loadTime = Date.now() - loadStartTimeRef.current
setDebugInfo((prev) => ({ ...prev, loadTimeMs: loadTime }))
setIsLoading(false)
@ -365,7 +288,19 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
} catch {
return false
}
}, [isOpenCVReady])
}, [isOpenCVReady, detectorConfig])
// Recreate detector when config changes (if OpenCV is already loaded)
useEffect(() => {
if (cvRef.current && detectorRef.current) {
detectorRef.current = createQuadDetector(cvRef.current, detectorConfig)
}
}, [detectorConfig])
// Update detector config function
const updateDetectorConfig = useCallback((newConfig: Partial<QuadDetectorConfig>) => {
setDetectorConfig((prev) => ({ ...prev, ...newConfig }))
}, [])
// Reusable canvas for video frame capture
const frameCanvasRef = useRef<HTMLCanvasElement | null>(null)
@ -391,7 +326,7 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
return frameCanvas
}, [])
// Calculate distance between two points
// Calculate distance between two points (kept for extractDocument)
const distance = useCallback(
(p1: { x: number; y: number }, p2: { x: number; y: number }): number => {
return Math.sqrt((p1.x - p2.x) ** 2 + (p1.y - p2.y) ** 2)
@ -399,298 +334,6 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
[]
)
// Order corners: top-left, top-right, bottom-right, bottom-left
const orderCorners = useCallback(
(corners: Array<{ x: number; y: number }>): Array<{ x: number; y: number }> => {
if (corners.length !== 4) return corners
// Find centroid
const cx = corners.reduce((s, c) => s + c.x, 0) / 4
const cy = corners.reduce((s, c) => s + c.y, 0) / 4
// Sort by angle from centroid
const sorted = [...corners].sort((a, b) => {
const angleA = Math.atan2(a.y - cy, a.x - cx)
const angleB = Math.atan2(b.y - cy, b.x - cx)
return angleA - angleB
})
// Find top-left (smallest x+y)
let topLeftIdx = 0
let minSum = Infinity
for (let i = 0; i < 4; i++) {
const sum = sorted[i].x + sorted[i].y
if (sum < minSum) {
minSum = sum
topLeftIdx = i
}
}
// Rotate array so top-left is first
const ordered = []
for (let i = 0; i < 4; i++) {
ordered.push(sorted[(topLeftIdx + i) % 4])
}
return ordered
},
[]
)
// Check if aspect ratio is document-like
const isDocumentAspectRatio = useCallback((ratio: number): boolean => {
return EXPECTED_ASPECT_RATIOS.some(
(expected) => Math.abs(ratio - expected) < ASPECT_RATIO_TOLERANCE
)
}, [])
// Generate a stable ID for a quad based on its center position
const getQuadCenterId = useCallback(
(corners: Array<{ x: number; y: number }>, frameWidth: number, frameHeight: number): string => {
const cx = corners.reduce((s, c) => s + c.x, 0) / 4
const cy = corners.reduce((s, c) => s + c.y, 0) / 4
// Quantize to grid cells (10x10 grid)
const gridX = Math.floor((cx / frameWidth) * 10)
const gridY = Math.floor((cy / frameHeight) * 10)
return `${gridX},${gridY}`
},
[]
)
// Check if two quads are similar (same document)
const quadsMatch = useCallback(
(
q1: Array<{ x: number; y: number }>,
q2: Array<{ x: number; y: number }>,
frameDiagonal: number
): boolean => {
const threshold = frameDiagonal * QUAD_MATCH_THRESHOLD
let totalDist = 0
for (let i = 0; i < 4; i++) {
totalDist += distance(q1[i], q2[i])
}
return totalDist / 4 < threshold
},
[distance]
)
// Calculate corner stability (how much corners move between frames)
const calculateCornerStability = useCallback(
(history: Array<Array<{ x: number; y: number }>>): number => {
if (history.length < 2) return 0
let totalVariance = 0
for (let corner = 0; corner < 4; corner++) {
const xs = history.map((h) => h[corner].x)
const ys = history.map((h) => h[corner].y)
const meanX = xs.reduce((a, b) => a + b, 0) / xs.length
const meanY = ys.reduce((a, b) => a + b, 0) / ys.length
const varX = xs.reduce((a, b) => a + (b - meanX) ** 2, 0) / xs.length
const varY = ys.reduce((a, b) => a + (b - meanY) ** 2, 0) / ys.length
totalVariance += Math.sqrt(varX + varY)
}
// Convert variance to stability score (lower variance = higher stability)
// Normalize: variance of 0 = stability 1, variance of 50+ = stability 0
const avgVariance = totalVariance / 4
return Math.max(0, 1 - avgVariance / 50)
},
[]
)
// Find all quadrilaterals in the frame using OpenCV
const findAllQuads = useCallback(
(frameCanvas: HTMLCanvasElement): DetectedQuad[] => {
const cv = cvRef.current
if (!cv) return []
const quads: DetectedQuad[] = []
const frameArea = frameCanvas.width * frameCanvas.height
const frameDiagonal = Math.sqrt(frameCanvas.width ** 2 + frameCanvas.height ** 2)
// OpenCV processing
let src: CVMat | null = null
let gray: CVMat | null = null
let blurred: CVMat | null = null
let edges: CVMat | null = null
let contours: CVMatVector | null = null
let hierarchy: CVMat | null = null
try {
src = cv.imread(frameCanvas)
gray = new cv.Mat()
blurred = new cv.Mat()
edges = new cv.Mat()
// Convert to grayscale
cv.cvtColor(src, gray, cv.COLOR_RGBA2GRAY)
// Blur to reduce noise
cv.GaussianBlur(gray, blurred, new cv.Size(5, 5), 0, 0, cv.BORDER_DEFAULT)
// Edge detection
cv.Canny(blurred, edges, 50, 150)
// Dilate edges to connect gaps
const kernel = new cv.Mat()
cv.dilate(edges, edges, kernel, { x: -1, y: -1 } as CVPoint, 1)
kernel.delete()
// Find contours
contours = new cv.MatVector()
hierarchy = new cv.Mat()
cv.findContours(edges, contours, hierarchy, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)
// Process each contour
for (let i = 0; i < contours.size(); i++) {
const contour = contours.get(i)
const area = cv.contourArea(contour)
const areaRatio = area / frameArea
// Skip if too small or too large
if (areaRatio < MIN_AREA_RATIO || areaRatio > MAX_AREA_RATIO) {
continue
}
// Approximate to polygon
const approx = new cv.Mat()
const perimeter = cv.arcLength(contour, true)
cv.approxPolyDP(contour, approx, 0.02 * perimeter, true)
// Check if it's a quadrilateral
if (approx.rows === 4) {
// Extract corners
const corners: Array<{ x: number; y: number }> = []
for (let j = 0; j < 4; j++) {
corners.push({
x: approx.data32S[j * 2],
y: approx.data32S[j * 2 + 1],
})
}
// Order corners consistently
const orderedCorners = orderCorners(corners)
// Calculate aspect ratio
const width = distance(orderedCorners[0], orderedCorners[1])
const height = distance(orderedCorners[1], orderedCorners[2])
const aspectRatio = Math.max(width, height) / Math.min(width, height)
// Check if aspect ratio is document-like
if (isDocumentAspectRatio(aspectRatio)) {
quads.push({
corners: orderedCorners,
area,
aspectRatio,
centerId: getQuadCenterId(orderedCorners, frameCanvas.width, frameCanvas.height),
})
}
}
approx.delete()
}
} finally {
// Clean up OpenCV memory
src?.delete()
gray?.delete()
blurred?.delete()
edges?.delete()
contours?.delete()
hierarchy?.delete()
}
// Sort by area (largest first)
quads.sort((a, b) => b.area - a.area)
return quads
},
[distance, orderCorners, isDocumentAspectRatio, getQuadCenterId]
)
// Update tracked quads with new detections
const updateTrackedQuads = useCallback(
(
detectedQuads: DetectedQuad[],
frameWidth: number,
frameHeight: number
): TrackedQuad | null => {
const currentFrame = frameCountRef.current++
const trackedQuads = trackedQuadsRef.current
const frameDiagonal = Math.sqrt(frameWidth ** 2 + frameHeight ** 2)
// Mark all tracked quads as not seen this frame
const seenIds = new Set<string>()
// Match detected quads to tracked quads
for (const detected of detectedQuads) {
let matched = false
for (const [id, tracked] of trackedQuads) {
if (!seenIds.has(id) && quadsMatch(detected.corners, tracked.corners, frameDiagonal)) {
// Update existing tracked quad
tracked.corners = detected.corners
tracked.area = detected.area
tracked.aspectRatio = detected.aspectRatio
tracked.frameCount++
tracked.lastSeenFrame = currentFrame
tracked.cornerHistory.push(detected.corners)
if (tracked.cornerHistory.length > HISTORY_LENGTH) {
tracked.cornerHistory.shift()
}
tracked.stabilityScore = calculateCornerStability(tracked.cornerHistory)
seenIds.add(id)
matched = true
break
}
}
if (!matched) {
// New quad - start tracking
const newId = `quad_${currentFrame}_${Math.random().toString(36).slice(2, 8)}`
trackedQuads.set(newId, {
id: newId,
corners: detected.corners,
area: detected.area,
aspectRatio: detected.aspectRatio,
frameCount: 1,
lastSeenFrame: currentFrame,
stabilityScore: 0,
cornerHistory: [detected.corners],
})
seenIds.add(newId)
}
}
// Remove quads not seen for a while
for (const [id, tracked] of trackedQuads) {
if (currentFrame - tracked.lastSeenFrame > 3) {
trackedQuads.delete(id)
}
}
// Find best quad (highest score = frameCount * stability * area)
let bestQuad: TrackedQuad | null = null
let bestScore = 0
for (const tracked of trackedQuads.values()) {
// Only consider quads seen recently
if (currentFrame - tracked.lastSeenFrame > 2) continue
// Score: prioritize stability and longevity, then area
const score = tracked.frameCount * (0.5 + tracked.stabilityScore) * Math.sqrt(tracked.area)
if (score > bestScore) {
bestScore = score
bestQuad = tracked
}
}
bestQuadRef.current = bestQuad
return bestQuad
},
[quadsMatch, calculateCornerStability]
)
// Draw quad on overlay canvas
const drawQuad = useCallback(
(
@ -725,8 +368,9 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
const highlightDocument = useCallback(
(video: HTMLVideoElement, overlayCanvas: HTMLCanvasElement): boolean => {
const cv = cvRef.current
if (!cv) return false
const detector = detectorRef.current
const tracker = trackerRef.current
if (!detector || !tracker) return false
const startTime = performance.now()
@ -755,29 +399,32 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
// Clear overlay
overlayCtx.clearRect(0, 0, overlayCanvas.width, overlayCanvas.height)
// Find all quads in this frame
const detectedQuads = findAllQuads(frameCanvas)
// Use modular detector
const detectedQuads = detector.detect(frameCanvas)
// Update tracking and get best quad
const bestQuad = updateTrackedQuads(detectedQuads, frameCanvas.width, frameCanvas.height)
// Use modular tracker
const bestQuad = tracker.update(detectedQuads, {
width: frameCanvas.width,
height: frameCanvas.height,
})
const stats = tracker.getStats()
const detectionTime = performance.now() - startTime
// Draw all detected quads (faded) for debugging
for (const quad of detectedQuads) {
if (bestQuad && quad.centerId === bestQuad.id) continue
drawQuad(overlayCtx, quad.corners, 'rgba(100, 100, 100, 0.3)', 2)
}
// Draw best quad with color based on stability
if (bestQuad) {
const isStable = bestQuad.frameCount >= MIN_FRAMES_FOR_STABLE
const isLocked = bestQuad.frameCount >= LOCKED_FRAME_COUNT
// Update bestQuadRef for extractDocument
bestQuadRef.current = bestQuad
let color: string
let lineWidth: number
if (isLocked && bestQuad.stabilityScore > 0.5) {
if (bestQuad.isLocked) {
color = 'rgba(0, 255, 100, 0.95)'
lineWidth = 6
// Save stable frame
@ -788,7 +435,7 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
lastStableFrameRef.current.height = frameCanvas.height
const stableCtx = lastStableFrameRef.current.getContext('2d')
stableCtx?.drawImage(frameCanvas, 0, 0)
} else if (isStable) {
} else if (bestQuad.isStable) {
color = 'rgba(100, 255, 100, 0.85)'
lineWidth = 5
} else {
@ -797,6 +444,8 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
}
drawQuad(overlayCtx, bestQuad.corners, color, lineWidth)
} else {
bestQuadRef.current = null
}
// Update debug info
@ -804,7 +453,7 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
...prev,
lastDetectionMs: Math.round(detectionTime),
quadsDetected: detectedQuads.length,
trackedQuads: trackedQuadsRef.current.size,
trackedQuads: stats.trackedCount,
bestQuadStability: bestQuad?.stabilityScore ?? 0,
bestQuadFrameCount: bestQuad?.frameCount ?? 0,
lastDetectionError: null,
@ -819,7 +468,7 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
return false
}
},
[captureVideoFrame, findAllQuads, updateTrackedQuads, drawQuad]
[captureVideoFrame, drawQuad]
)
/**
@ -1085,11 +734,24 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
[captureVideoFrame, distance, analyzeOrientation, rotateCanvas]
)
// Compute derived state
// Reset tracking state (call when returning from adjustment mode)
const resetTracking = useCallback(() => {
trackerRef.current?.reset()
bestQuadRef.current = null
lastStableFrameRef.current = null
setDebugInfo((prev) => ({
...prev,
quadsDetected: 0,
trackedQuads: 0,
bestQuadStability: 0,
bestQuadFrameCount: 0,
}))
}, [])
// Compute derived state (use isStable/isLocked from tracked quad)
const bestQuad = bestQuadRef.current
const isStable = bestQuad ? bestQuad.frameCount >= MIN_FRAMES_FOR_STABLE : false
const isLocked =
bestQuad && bestQuad.frameCount >= LOCKED_FRAME_COUNT && bestQuad.stabilityScore > 0.5
const isStable = bestQuad?.isStable ?? false
const isLocked = bestQuad?.isLocked ?? false
// Get current best quad corners
const getBestQuadCorners = useCallback((): Array<{
@ -1152,59 +814,57 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
* Detect quads in a static image (for file uploads and gallery edits)
* Returns detected corners or fallback corners (full image)
*/
const detectQuadsInImage = useCallback(
(canvas: HTMLCanvasElement): DetectQuadsInImageResult => {
// Fallback corners (full image)
const fallbackCorners = [
{ x: 0, y: 0 },
{ x: canvas.width, y: 0 },
{ x: canvas.width, y: canvas.height },
{ x: 0, y: canvas.height },
]
const detectQuadsInImage = useCallback((canvas: HTMLCanvasElement): DetectQuadsInImageResult => {
// Fallback corners (full image)
const fallbackCorners = [
{ x: 0, y: 0 },
{ x: canvas.width, y: 0 },
{ x: canvas.width, y: canvas.height },
{ x: 0, y: canvas.height },
]
if (!cvRef.current) {
const detector = detectorRef.current
if (!detector) {
return {
detected: false,
corners: fallbackCorners,
sourceCanvas: canvas,
}
}
try {
// Use modular detector
const detectedQuads = detector.detect(canvas)
if (detectedQuads.length > 0) {
// Return the best quad (largest area, already sorted)
return {
detected: false,
corners: fallbackCorners,
detected: true,
corners: detectedQuads[0].corners,
sourceCanvas: canvas,
}
}
try {
// Run quad detection on the canvas
const detectedQuads = findAllQuads(canvas)
if (detectedQuads.length > 0) {
// Return the best quad (largest area, already sorted)
return {
detected: true,
corners: detectedQuads[0].corners,
sourceCanvas: canvas,
}
}
// No quads detected - return fallback
return {
detected: false,
corners: fallbackCorners,
sourceCanvas: canvas,
}
} catch (err) {
console.warn('Quad detection failed:', err)
return {
detected: false,
corners: fallbackCorners,
sourceCanvas: canvas,
}
// No quads detected - return fallback
return {
detected: false,
corners: fallbackCorners,
sourceCanvas: canvas,
}
},
[findAllQuads]
)
} catch (err) {
console.warn('Quad detection failed:', err)
return {
detected: false,
corners: fallbackCorners,
sourceCanvas: canvas,
}
}
}, [])
return {
isLoading,
error,
isReady: !isLoading && !error && cvRef.current !== null,
isReady: !isLoading && !error && detectorRef.current !== null,
ensureOpenCVLoaded,
isStable,
isLocked: !!isLocked,
@ -1216,6 +876,9 @@ export function useDocumentDetection(): UseDocumentDetectionReturn {
extractDocument,
detectQuadsInImage,
loadImageToCanvas,
resetTracking,
updateDetectorConfig,
detectorConfig,
}
}

View File

@ -9,11 +9,7 @@
import { useCallback, useRef, useState } from 'react'
import type { CV } from '@/lib/vision/opencv/types'
import {
loadOpenCV as loadOpenCVCore,
getOpenCV,
isOpenCVReady,
} from '@/lib/vision/opencv/loader'
import { loadOpenCV as loadOpenCVCore, getOpenCV, isOpenCVReady } from '@/lib/vision/opencv/loader'
export interface UseOpenCVReturn {
/** OpenCV instance (null if not loaded) */

View File

@ -24,7 +24,12 @@ export type {
} from '@/lib/vision/quadDetection'
// Re-export utility functions
export { loadImageToCanvas, captureVideoFrame, orderCorners, distance } from '@/lib/vision/quadDetection'
export {
loadImageToCanvas,
captureVideoFrame,
orderCorners,
distance,
} from '@/lib/vision/quadDetection'
/**
* React hook for quad detection in static images.
@ -89,13 +94,10 @@ export function useQuadDetection(options?: QuadDetectionOptions) {
* Detect quads in a canvas.
* Returns null if detector is not loaded.
*/
const detect = useCallback(
(canvas: HTMLCanvasElement): QuadDetectionResult | null => {
if (!detectorRef.current) return null
return detectorRef.current.detect(canvas, optionsRef.current)
},
[]
)
const detect = useCallback((canvas: HTMLCanvasElement): QuadDetectionResult | null => {
if (!detectorRef.current) return null
return detectorRef.current.detect(canvas, optionsRef.current)
}, [])
/**
* Detect quads in an image file.
@ -155,10 +157,13 @@ export function useQuadDetection(options?: QuadDetectionOptions) {
/**
* Extract a quad region using perspective transform.
*/
const extract = useCallback((canvas: HTMLCanvasElement, corners: Corner[]): HTMLCanvasElement | null => {
if (!detectorRef.current) return null
return detectorRef.current.extract(canvas, corners)
}, [])
const extract = useCallback(
(canvas: HTMLCanvasElement, corners: Corner[]): HTMLCanvasElement | null => {
if (!detectorRef.current) return null
return detectorRef.current.extract(canvas, corners)
},
[]
)
/**
* Analyze document orientation.

View File

@ -0,0 +1,241 @@
# Document Detection Research
Research notes on improving quad/document detection, particularly for handling finger occlusion and complex backgrounds.
**Date**: January 2026
**Context**: Current OpenCV-based quad detection struggles with finger occlusion and busy backgrounds.
---
## The Core Problem
Standard Canny edge detection fails for document scanning because:
> "The sections of text inside the document are strongly amplified, whereas the document edges—what we're interested in—show up very weakly."
> — Dropbox Engineering
Traditional CV approaches (Canny + Hough) can only work with **visible edges**. When fingers occlude document corners, the edge pixels simply aren't there.
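For reference, a minimal sketch of that visible-edges pipeline with OpenCV.js, using the `CV` interface from this repo's type definitions (thresholds are illustrative):

```typescript
import type { CV } from '@/lib/vision/opencv/types'

// Canny + probabilistic Hough: only edges that are actually visible produce
// line segments, which is exactly why a finger over a corner breaks this.
function detectVisibleEdgeLines(
  cv: CV,
  canvas: HTMLCanvasElement
): Array<[number, number, number, number]> {
  const src = cv.imread(canvas)
  const gray = new cv.Mat()
  const edges = new cv.Mat()
  const lines = new cv.Mat()
  try {
    cv.cvtColor(src, gray, cv.COLOR_RGBA2GRAY)
    cv.Canny(gray, edges, 50, 150) // illustrative thresholds
    // rho = 1px, theta = 1 degree, 80 votes, min length 50px, max gap 10px
    cv.HoughLinesP(edges, lines, 1, Math.PI / 180, 80, 50, 10)
    const segments: Array<[number, number, number, number]> = []
    for (let i = 0; i < lines.rows; i++) {
      segments.push([
        lines.data32S[i * 4],
        lines.data32S[i * 4 + 1],
        lines.data32S[i * 4 + 2],
        lines.data32S[i * 4 + 3],
      ])
    }
    return segments
  } finally {
    src.delete()
    gray.delete()
    edges.delete()
    lines.delete()
  }
}
```

When a finger covers a corner, the Canny map simply has no edge pixels there, so no Hough segment can recover that side.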
---
## Industry Approaches
### Dropbox (2016)
**Source**: [Fast and Accurate Document Detection for Scanning](https://dropbox.tech/machine-learning/fast-and-accurate-document-detection-for-scanning)
1. **Custom ML-based edge detector** - trained to suppress text edges while keeping document boundaries (details proprietary)
2. **Hough transform** for line detection from the cleaned edge map
3. **Quadrilateral scoring** - enumerate all possible quads from line intersections, score each by summing edge probabilities along perimeter
4. **Result**: 8-10 FPS, 60% fewer manual corrections vs Apple's SDK
Follow-up: [Improving the Responsiveness of the Document Detector](https://dropbox.tech/machine-learning/improving-the-responsiveness-of-the-document-detector)
- Motion-based quad tracking between frames
- Hybrid: full detection every ~100ms + fast tracking on intermediate frames (cadence sketched below)
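A minimal sketch of that cadence (`detectFull` and `trackFast` are hypothetical stand-ins for Dropbox's detector and tracker):

```typescript
type Point = { x: number; y: number }

// hypothetical stand-ins for a full detector and a cheap frame-to-frame tracker
declare function detectFull(frame: HTMLCanvasElement): Point[] | null
declare function trackFast(frame: HTMLCanvasElement, prev: Point[]): Point[] | null

const FULL_DETECTION_INTERVAL_MS = 100

let lastFullDetection = 0
let quad: Point[] | null = null

function onFrame(frame: HTMLCanvasElement): Point[] | null {
  const now = performance.now()
  if (quad === null || now - lastFullDetection > FULL_DETECTION_INTERVAL_MS) {
    quad = detectFull(frame) // expensive, runs every ~100ms
    lastFullDetection = now
  } else {
    quad = trackFast(frame, quad) // cheap motion tracking between detections
  }
  return quad
}
```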
### Genius Scan (2024)
**Source**: [Document Detection - How Deep Learning Has Changed The Game](https://blog.thegrizzlylabs.com/2024/10/document-detection.html)
**Key insight**: Combining DL + traditional CV raised accuracy from 51% → 75% → 85%:
- DL provides **robustness** (handles occlusion, complex backgrounds)
- Traditional CV provides **precision** (sub-pixel corner refinement)
Architecture:
- MobileNet V2 backbone
- Input resolution: 96×96 pixels
- Training dataset: 1M+ images
- Pre-training: ImageNet, fine-tuned on document data
- Performance: 25+ FPS on mobile
### Scanner Pro (Readdle)
**Source**: [Inside ScannerPro: the Tech behind perfect scans](https://readdle.com/blog/scanner-pro-border-detection)
Evolution:
1. Traditional CV (Canny + Hough) - baseline
2. Semantic segmentation - too slow (5 FPS on iPhone X)
3. **Keypoint detection** - direct corner prediction, 30+ FPS
Key techniques:
- MobileNet-based keypoint detector
- Kalman filter + IMU data for temporal smoothing (see the scalar sketch below)
- Two-stage: lightweight detector for streaming, heavier model on capture
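A simplified sketch of the smoothing piece: a constant-position Kalman filter per corner coordinate (the IMU fusion Scanner Pro describes is omitted):

```typescript
// One-dimensional constant-position Kalman filter; run one per corner coordinate.
// q = process noise (how much the corner may drift per frame),
// r = measurement noise (how jittery raw detections are) - both illustrative.
class ScalarKalman {
  private estimate = 0
  private variance = 1
  private initialized = false

  constructor(
    private readonly q = 0.05,
    private readonly r = 4
  ) {}

  update(measurement: number): number {
    if (!this.initialized) {
      this.estimate = measurement
      this.initialized = true
      return this.estimate
    }
    this.variance += this.q // predict: uncertainty grows
    const gain = this.variance / (this.variance + this.r)
    this.estimate += gain * (measurement - this.estimate) // correct toward measurement
    this.variance *= 1 - gain
    return this.estimate
  }
}

// usage: 8 filters, one per (x, y) of each of the 4 corners
const filters = Array.from({ length: 8 }, () => new ScalarKalman())
```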
### Academic Approaches
**Multi-document detection via corner localization**:
- Joint Corner Detector (JCD) with attention mechanism
- Coarse-to-fine: rough prediction → corner-specific refinement
- Datasets: ICDAR 2015 SmartDoc, SEECS-NUSF, MIDV-500
**Semantic segmentation** (LearnOpenCV tutorial):
- DeepLabv3 with MobileNetV3-Large backbone
- Binary segmentation (document vs background)
- Trained on synthetic data with augmentation
- Extract corners from mask via contour detection (sketched below)
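Corner extraction from a binary mask is plain OpenCV work. A sketch against this repo's `CV` interface, assuming `mask` is a single-channel binary Mat:

```typescript
import type { CV, CVMat } from '@/lib/vision/opencv/types'

// Take the largest 4-point polygon from a binary document mask.
function cornersFromMask(cv: CV, mask: CVMat): Array<{ x: number; y: number }> {
  const contours = new cv.MatVector()
  const hierarchy = new cv.Mat()
  let best: Array<{ x: number; y: number }> = []
  let bestArea = 0
  try {
    cv.findContours(mask, contours, hierarchy, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    for (let i = 0; i < contours.size(); i++) {
      const contour = contours.get(i)
      const area = cv.contourArea(contour)
      if (area <= bestArea) continue
      const approx = new cv.Mat()
      cv.approxPolyDP(contour, approx, 0.02 * cv.arcLength(contour, true), true)
      if (approx.rows === 4) {
        bestArea = area
        best = []
        for (let j = 0; j < 4; j++) {
          best.push({ x: approx.data32S[j * 2], y: approx.data32S[j * 2 + 1] })
        }
      }
      approx.delete()
    }
  } finally {
    contours.delete()
    hierarchy.delete()
  }
  return best
}
```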
---
## Modern Architecture Pattern
The recommended approach for robust document detection:
```
Input Image (downscaled to 96-256px)
          │
          ▼
┌───────────────────┐
│  MobileNetV2/V3   │ ← Pretrained on ImageNet
│ Feature Extractor │
└─────────┬─────────┘
          │
          ▼
┌───────────────────┐
│  Regression Head  │ ← 8 outputs (x,y for 4 corners)
│ (or Heatmap Head) │
└─────────┬─────────┘
          │
          ▼
  Corner Coordinates
          │
          ▼
┌───────────────────┐
│   Optional: CV    │ ← Sub-pixel refinement
│    Refinement     │
└───────────────────┘
```
**Why this works for occlusion**: The network learns document shape priors and can predict where corners *should* be even when they're not visible.
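Decoding the head's output is trivial, which is part of the appeal. A sketch, assuming the 8 outputs are normalized [0,1] corner coordinates in a fixed order:

```typescript
type Point = { x: number; y: number }

// Map the 8 normalized regression outputs back to pixel space of the frame.
function decodeCorners(outputs: Float32Array, frameWidth: number, frameHeight: number): Point[] {
  const corners: Point[] = []
  for (let i = 0; i < 4; i++) {
    corners.push({
      x: outputs[i * 2] * frameWidth,
      y: outputs[i * 2 + 1] * frameHeight,
    })
  }
  return corners
}
```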
---
## Hugging Face Models
### Document-Specific
**[ordaktaktak/Document-Scanner](https://huggingface.co/ordaktaktak/Document-Scanner)**
- Architecture: U-Net semantic segmentation
- Input: Grayscale 256×256
- Output: Binary mask → extract corners via contour detection
- Framework: PyTorch (.pth weights)
- Status: Would need conversion to ONNX/TF.js
### Background Removal (Could Adapt)
**[briaai/RMBG-2.0](https://huggingface.co/briaai/RMBG-2.0)**
- Architecture: BiRefNet (0.2B params) - too large for real-time
- Input: 1024×1024 RGB
- Output: Alpha matte
- Transformers.js compatible
- License: CC BY-NC 4.0 (non-commercial)
**[briaai/RMBG-1.4](https://huggingface.co/briaai/RMBG-1.4)**
- Smaller version (44.1M params)
- Same approach, might be more practical
### General Segmentation (Transformers.js Ready)
| Model | Size | Use Case |
|-------|------|----------|
| `Xenova/deeplabv3-mobilevit-xx-small` | Tiny | Fast, low accuracy |
| `Xenova/deeplabv3-mobilevit-small` | Small | Balanced |
| `Xenova/deeplabv3-mobilevit-x-small` | X-Small | Middle ground |
| `nnny/onnx-mobile-sam` | ~5MB | General segmentation with prompts |
### SAM-based Approach
Could use Segment Anything Model with point prompts:
1. User taps roughly in document area
2. SAM segments the document
3. Extract corners from segmentation mask
Models: `nnny/onnx-mobile-sam`, SlimSAM variants
---
## Implementation Options
### Option 1: Convert Document-Scanner to ONNX
```python
# Download the PyTorch weights, convert with torch.onnx.export(),
# then run the result in the browser with ONNX Runtime Web.
import torch

model = load_document_scanner_model()  # hypothetical loader for the .pth weights
dummy_input = torch.randn(1, 1, 256, 256)  # grayscale 256x256, NCHW
torch.onnx.export(model, dummy_input, "document_scanner.onnx")
```
Pros: Purpose-built for documents
Cons: Still outputs mask, need CV for corners
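On the browser side, inference with ONNX Runtime Web would look roughly like this (a sketch; the model path and tensor names depend on how the export above labels them):

```typescript
import * as ort from 'onnxruntime-web'

// Run the converted U-Net on a 256x256 grayscale tensor and return the mask.
async function runDocumentScanner(grayscale: Float32Array): Promise<ort.Tensor> {
  const session = await ort.InferenceSession.create('/models/document_scanner.onnx')
  const input = new ort.Tensor('float32', grayscale, [1, 1, 256, 256])
  // assumption: single input and output; names come from the ONNX export
  const feeds = { [session.inputNames[0]]: input }
  const results = await session.run(feeds)
  return results[session.outputNames[0]] as ort.Tensor
}
```

The returned mask would then go through contour-based corner extraction, as in the segmentation sketch above.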
### Option 2: SAM with Point Prompts
```typescript
import { pipeline } from '@huggingface/transformers';

// NOTE: the prompt-passing API below is an assumption - how point prompts are
// supplied depends on the pipeline/model wrapper, so verify against the
// Transformers.js docs before relying on this shape.
const segmenter = await pipeline('image-segmentation', 'nnny/onnx-mobile-sam');
const result = await segmenter(image, { points: [[centerX, centerY]] });
// Extract corners from the returned mask (see the contour sketch above)
```
Pros: No training needed, handles complex shapes
Cons: Requires user interaction (point prompt)
### Option 3: Train Custom Corner Detector
Training a lightweight model specifically for corner prediction:
1. **Architecture**: MobileNetV2 → 8-output regression (4 corners × 2 coords)
2. **Training data**:
- SmartDoc dataset (ICDAR)
- DocVQA documents on backgrounds
- Synthetic: random quads with augmentation
- **Critical**: Include finger occlusion augmentation
3. **Output**: Normalized corner coordinates [0,1]
4. **Export**: TensorFlow.js or ONNX
This is what Dropbox, Genius Scan, and Scanner Pro actually do.
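A sketch of what the regression head might look like in TensorFlow.js, assuming a pretrained single-output convolutional backbone is already loaded as a `tf.LayersModel` (layer sizes are illustrative):

```typescript
import * as tf from '@tensorflow/tfjs'

// 8-output corner regression head on top of a frozen feature extractor.
function buildCornerModel(backbone: tf.LayersModel): tf.LayersModel {
  backbone.trainable = false // fine-tune the head first, unfreeze later
  let x = tf.layers.globalAveragePooling2d({}).apply(backbone.output) as tf.SymbolicTensor
  x = tf.layers.dense({ units: 128, activation: 'relu' }).apply(x) as tf.SymbolicTensor
  // sigmoid keeps the 4 (x, y) pairs in [0, 1], matching normalized labels
  const corners = tf.layers
    .dense({ units: 8, activation: 'sigmoid' })
    .apply(x) as tf.SymbolicTensor
  const model = tf.model({ inputs: backbone.inputs, outputs: corners })
  model.compile({ optimizer: tf.train.adam(1e-4), loss: 'meanSquaredError' })
  return model
}
```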
### Option 4: Hybrid (Recommended for Production)
1. **Primary**: Lightweight CNN corner predictor (handles occlusion)
2. **Refinement**: Traditional CV on predicted region (sub-pixel accuracy)
3. **Tracking**: Kalman filter for temporal stability (per-frame flow sketched below)
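Put together, the per-frame flow is small; the heavy lifting lives in the stand-in helpers (`predictCorners`, `refineCornersCV`, and `smoother` are all hypothetical):

```typescript
type Point = { x: number; y: number }

declare function predictCorners(frame: HTMLCanvasElement): Promise<Point[]> // CNN: robust to occlusion
declare function refineCornersCV(frame: HTMLCanvasElement, rough: Point[]): Point[] // CV: sub-pixel precision
declare const smoother: { update(corners: Point[]): Point[] } // Kalman: temporal stability

async function detectDocument(frame: HTMLCanvasElement): Promise<Point[]> {
  const rough = await predictCorners(frame)
  const refined = refineCornersCV(frame, rough)
  return smoother.update(refined)
}
```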
---
## Datasets
| Dataset | Size | Notes |
|---------|------|-------|
| SmartDoc (ICDAR 2015) | 4,260 images | Competition dataset, labeled corners |
| MIDV-500 | 500 video clips | ID documents, challenging conditions |
| DocVQA | 50K+ images | Document images (need corner labels) |
| Synthetic | Unlimited | Generate documents on backgrounds (toy generator sketched below) |
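A toy version of the synthetic generator, mainly to show the label format (real pipelines warp actual document textures and add shadows and finger occluders rather than a flat quad):

```typescript
type Point = { x: number; y: number }

// Draw a random light quad over a background canvas and return the image
// together with normalized corner labels (8 values, the regression target).
function makeSyntheticSample(bg: HTMLCanvasElement): { canvas: HTMLCanvasElement; label: number[] } {
  const canvas = document.createElement('canvas')
  canvas.width = bg.width
  canvas.height = bg.height
  const ctx = canvas.getContext('2d')!
  ctx.drawImage(bg, 0, 0)
  const margin = () => 0.1 + Math.random() * 0.15 // random inset from each side
  const corners: Point[] = [
    { x: margin(), y: margin() },
    { x: 1 - margin(), y: margin() },
    { x: 1 - margin(), y: 1 - margin() },
    { x: margin(), y: 1 - margin() },
  ]
  ctx.fillStyle = '#f5f2ea' // paper-ish tone
  ctx.beginPath()
  corners.forEach((c, i) =>
    i === 0
      ? ctx.moveTo(c.x * canvas.width, c.y * canvas.height)
      : ctx.lineTo(c.x * canvas.width, c.y * canvas.height)
  )
  ctx.closePath()
  ctx.fill()
  return { canvas, label: corners.flatMap((c) => [c.x, c.y]) }
}
```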
---
## Key Takeaways
1. **Traditional CV (our current approach) will always struggle with occlusion** - it can only see visible edges
2. **The industry solution is learned corner prediction** - networks trained on documents learn shape priors
3. **Hybrid approaches work best** - DL for robustness, CV for precision
4. **No ready-to-use model exists** on Hugging Face that:
- Is specifically trained for document corners
- Handles finger occlusion
- Is already in TF.js/ONNX format
5. **Realistic path forward**:
- Short term: Try SAM with prompts, or convert Document-Scanner to ONNX
- Long term: Train custom MobileNet-based corner detector
---
## References
- [Dropbox: Fast and Accurate Document Detection](https://dropbox.tech/machine-learning/fast-and-accurate-document-detection-for-scanning)
- [Dropbox: Improving Responsiveness](https://dropbox.tech/machine-learning/improving-the-responsiveness-of-the-document-detector)
- [Genius Scan: Document Detection with Deep Learning](https://blog.thegrizzlylabs.com/2024/10/document-detection.html)
- [Scanner Pro: Tech Behind Perfect Scans](https://readdle.com/blog/scanner-pro-border-detection)
- [LearnOpenCV: Document Segmentation with DeepLabv3](https://learnopencv.com/deep-learning-based-document-segmentation-using-semantic-segmentation-deeplabv3-on-custom-dataset/)
- [Transformers.js Documentation](https://huggingface.co/docs/transformers.js/en/index)
- [U-Net Paper (arxiv 1505.04597)](https://arxiv.org/abs/1505.04597)

View File

@ -6,7 +6,7 @@
*/
export async function simpleDelay(ms: number): Promise<string> {
console.log('[simpleAsync] simpleDelay called with', ms)
await new Promise(resolve => setTimeout(resolve, ms))
await new Promise((resolve) => setTimeout(resolve, ms))
console.log('[simpleAsync] delay complete')
return 'done'
}

View File

@ -7,7 +7,9 @@
export interface CVMat {
delete: () => void
copyTo: (dst: CVMat) => void
data32S: Int32Array
data32F: Float32Array
data: ArrayBuffer
rows: number
cols: number
@ -46,7 +48,48 @@ export interface CV {
borderType: number
) => void
Canny: (src: CVMat, dst: CVMat, t1: number, t2: number) => void
Sobel: (src: CVMat, dst: CVMat, ddepth: number, dx: number, dy: number, ksize?: number) => void
addWeighted: (
src1: CVMat,
alpha: number,
src2: CVMat,
beta: number,
gamma: number,
dst: CVMat
) => void
convertScaleAbs: (src: CVMat, dst: CVMat, alpha?: number, beta?: number) => void
equalizeHist: (src: CVMat, dst: CVMat) => void
adaptiveThreshold: (
src: CVMat,
dst: CVMat,
maxValue: number,
adaptiveMethod: number,
thresholdType: number,
blockSize: number,
C: number
) => void
threshold: (src: CVMat, dst: CVMat, thresh: number, maxval: number, type: number) => number
bilateralFilter: (
src: CVMat,
dst: CVMat,
d: number,
sigmaColor: number,
sigmaSpace: number,
borderType?: number
) => void
morphologyEx: (
src: CVMat,
dst: CVMat,
op: number,
kernel: CVMat,
anchor?: CVPoint,
iterations?: number
) => void
getStructuringElement: (shape: number, ksize: CVSize, anchor?: CVPoint) => CVMat
erode: (src: CVMat, dst: CVMat, kernel: CVMat, anchor?: CVPoint, iterations?: number) => void
dilate: (src: CVMat, dst: CVMat, kernel: CVMat, anchor: CVPoint, iterations: number) => void
bitwise_or: (src1: CVMat, src2: CVMat, dst: CVMat) => void
bitwise_and: (src1: CVMat, src2: CVMat, dst: CVMat) => void
findContours: (
src: CVMat,
contours: CVMatVector,
@ -57,6 +100,17 @@ export interface CV {
contourArea: (contour: CVMat) => number
arcLength: (contour: CVMat, closed: boolean) => number
approxPolyDP: (contour: CVMat, approx: CVMat, epsilon: number, closed: boolean) => void
convexHull: (src: CVMat, dst: CVMat, clockwise: boolean, returnPoints: boolean) => void
// Hough line detection
HoughLinesP: (
src: CVMat,
lines: CVMat,
rho: number,
theta: number,
threshold: number,
minLineLength?: number,
maxLineGap?: number
) => void
getPerspectiveTransform: (src: CVMat, dst: CVMat) => CVMat
warpPerspective: (
src: CVMat,
@ -72,13 +126,34 @@ export interface CV {
COLOR_RGBA2GRAY: number
BORDER_DEFAULT: number
RETR_LIST: number
RETR_EXTERNAL: number
CHAIN_APPROX_SIMPLE: number
CV_32FC2: number
CV_32SC4: number
CV_8U: number
CV_16S: number
CV_64F: number
INTER_LINEAR: number
BORDER_CONSTANT: number
ROTATE_90_CLOCKWISE: number
ROTATE_180: number
ROTATE_90_COUNTERCLOCKWISE: number
// Threshold types
THRESH_BINARY: number
THRESH_BINARY_INV: number
THRESH_OTSU: number
// Adaptive threshold methods
ADAPTIVE_THRESH_MEAN_C: number
ADAPTIVE_THRESH_GAUSSIAN_C: number
// Morphological operations
MORPH_RECT: number
MORPH_ELLIPSE: number
MORPH_CROSS: number
MORPH_OPEN: number
MORPH_CLOSE: number
MORPH_GRADIENT: number
MORPH_DILATE: number
MORPH_ERODE: number
}
/**

File diff suppressed because it is too large

View File

@ -0,0 +1,266 @@
'use client'
/**
* useQuadDetection Hook
*
* A React hook that combines OpenCV loading, quad detection, and temporal tracking.
* Provides a clean API for detecting quadrilaterals in both camera feeds and static images.
*
* Usage:
* ```tsx
* import { OpenCvProvider } from 'opencv-react'
* import { useQuadDetection } from '@/lib/vision/useQuadDetection'
*
* // Wrap your app/page with OpenCvProvider
* <OpenCvProvider>
* <MyComponent />
* </OpenCvProvider>
*
* // In your component:
* function MyComponent() {
* const {
* isReady,
* detectInImage,
* processFrame,
* trackedQuad,
* stats,
* resetTracking,
* } = useQuadDetection()
*
* // For static images:
* const quads = detectInImage(canvas)
*
* // For camera feeds (call each frame):
* const bestQuad = processFrame(videoFrame)
* }
* ```
*/
import { useCallback, useMemo, useRef } from 'react'
import { useOpenCv } from 'opencv-react'
import {
createQuadDetector,
type DetectedQuad,
type DebugPolygon,
type QuadDetectorConfig,
} from './quadDetector'
import { createQuadTracker, type TrackedQuad, type QuadTrackerConfig } from './quadTracker'
import type { CV } from './opencv/types'
// Re-export types for convenience
export type { DetectedQuad, Point, DebugPolygon } from './quadDetector'
export type { TrackedQuad } from './quadTracker'
export type { QuadDetectorConfig } from './quadDetector'
export type { QuadTrackerConfig } from './quadTracker'
/** Configuration for useQuadDetection */
export interface UseQuadDetectionConfig {
/** Quad detector configuration */
detector?: Partial<QuadDetectorConfig>
/** Quad tracker configuration */
tracker?: Partial<QuadTrackerConfig>
}
/** Stats returned by the hook */
export interface QuadDetectionStats {
/** Number of quads currently being tracked */
trackedCount: number
/** Total frames processed */
frameCount: number
/** Stability score of the best quad (0-1) */
bestStability: number
/** Frame count of the best quad */
bestFrameCount: number
}
/** Result from processing a single frame */
export interface FrameProcessingResult {
/** Best tracked quad, or null if none */
trackedQuad: TrackedQuad | null
/** All quads detected in this frame (before tracking) */
detectedQuads: DetectedQuad[]
/** Current tracking statistics */
stats: QuadDetectionStats
}
/** Return type of useQuadDetection */
export interface UseQuadDetectionReturn {
/** Whether OpenCV is loaded and detector is ready */
isReady: boolean
/** Whether OpenCV is currently loading */
isLoading: boolean
/** Error message if loading failed */
error: string | null
/**
* Detect quads in a static image (one-shot, no tracking)
* @param source - Canvas to detect in
* @returns Array of detected quads, sorted by area (largest first)
*/
detectInImage: (source: HTMLCanvasElement) => DetectedQuad[]
/**
* Detect quads with debug info about all candidate polygons.
* Use this to understand why detection is failing.
* @param source - Canvas to detect in
* @returns Quads and debug info about all candidates
*/
detectWithDebug: (source: HTMLCanvasElement) => {
quads: DetectedQuad[]
debugPolygons: DebugPolygon[]
}
/**
* Process a video frame with tracking
* Call this each frame for camera/video feeds
* @param source - Canvas from video frame
* @param frameSize - Optional explicit frame size (inferred from source if not provided)
* @returns Frame processing result with tracked quad, detected quads, and stats
*/
processFrame: (
source: HTMLCanvasElement,
frameSize?: { width: number; height: number }
) => FrameProcessingResult
/** The current best tracked quad */
trackedQuad: TrackedQuad | null
/** All currently tracked quads */
allTrackedQuads: TrackedQuad[]
/** Current tracking statistics */
stats: QuadDetectionStats
/** Reset all tracking state (call when switching cameras, etc.) */
resetTracking: () => void
}
/**
* React hook for quad detection with optional temporal tracking.
*
* Must be used inside an OpenCvProvider from 'opencv-react'.
*
* @param config - Optional configuration for detector and tracker
*/
export function useQuadDetection(config?: UseQuadDetectionConfig): UseQuadDetectionReturn {
const { loaded: opencvLoaded, cv } = useOpenCv()
// Track the current best quad in a ref for synchronous access
const trackedQuadRef = useRef<TrackedQuad | null>(null)
const allTrackedRef = useRef<TrackedQuad[]>([])
const statsRef = useRef<QuadDetectionStats>({
trackedCount: 0,
frameCount: 0,
bestStability: 0,
bestFrameCount: 0,
})
// Create detector when cv is available
const detector = useMemo(() => {
if (!opencvLoaded || !cv) return null
try {
return createQuadDetector(cv as CV, config?.detector)
} catch (err) {
console.error('[useQuadDetection] Failed to create detector:', err)
return null
}
}, [opencvLoaded, cv, config?.detector])
// Create tracker (doesn't need cv)
const tracker = useMemo(() => createQuadTracker(config?.tracker), [config?.tracker])
// Detect in static image (no tracking)
const detectInImage = useCallback(
(source: HTMLCanvasElement): DetectedQuad[] => {
if (!detector) {
console.warn('[useQuadDetection] detectInImage called before detector ready')
return []
}
return detector.detect(source)
},
[detector]
)
// Detect with debug info (for debugging detection issues)
const detectWithDebug = useCallback(
(source: HTMLCanvasElement): { quads: DetectedQuad[]; debugPolygons: DebugPolygon[] } => {
if (!detector) {
console.warn('[useQuadDetection] detectWithDebug called before detector ready')
return { quads: [], debugPolygons: [] }
}
return detector.detectWithDebug(source)
},
[detector]
)
// Process video frame with tracking
const processFrame = useCallback(
(
source: HTMLCanvasElement,
frameSize?: { width: number; height: number }
): FrameProcessingResult => {
if (!detector) {
return {
trackedQuad: null,
detectedQuads: [],
stats: statsRef.current,
}
}
// Detect quads in frame
const quads = detector.detect(source)
// Determine frame size
const size = frameSize ?? {
width: source.width,
height: source.height,
}
// Update tracker
const bestQuad = tracker.update(quads, size)
const currentStats = tracker.getStats()
const allTracked = tracker.getAllTracked()
// Update refs
trackedQuadRef.current = bestQuad
allTrackedRef.current = allTracked
statsRef.current = currentStats
return {
trackedQuad: bestQuad,
detectedQuads: quads,
stats: currentStats,
}
},
[detector, tracker]
)
// Reset tracking
const resetTracking = useCallback(() => {
tracker.reset()
trackedQuadRef.current = null
allTrackedRef.current = []
statsRef.current = {
trackedCount: 0,
frameCount: 0,
bestStability: 0,
bestFrameCount: 0,
}
}, [tracker])
return {
isReady: !!detector,
isLoading: !opencvLoaded,
error: null, // opencv-react doesn't expose errors directly
detectInImage,
detectWithDebug,
processFrame,
trackedQuad: trackedQuadRef.current,
allTrackedQuads: allTrackedRef.current,
stats: statsRef.current,
resetTracking,
}
}