AlpenQueue/pkg/db/db.go
Soldier 985d340855 Add raw HTML archiving for historical re-parsing
Store complete HTML response in raw_html column before extraction. Enables re-running selectors on historical scrapes when sites change their DOM structure or CSS classes.
2025-11-16 08:43:46 +00:00

93 lines
2.2 KiB
Go

package db
import (
"database/sql"
"time"
_ "github.com/mattn/go-sqlite3"
)
// Job mirrors one row of the jobs table created by Init.
type Job struct {
	// Row identity and lifecycle. Status is 'pending' on insert and is set to
	// 'done' by MarkJobDone.
	ID        int
	Status    string
	CreatedAt time.Time

	// Values supplied to CreateJob when the row is inserted.
	WebhookURL, URL, Selector string

	// Values written after insertion by UpdateJobContent and UpdateJobHTML
	// respectively.
	ExtractedContent, RawHTML string
}
// Init opens the database at dbPath through the "sqlite3" driver and makes
// sure the jobs table exists.
//
// On success the caller owns the returned handle and is responsible for
// closing it. On failure the returned handle is nil and nothing is left open:
// if the schema statement fails, the handle opened here is closed before the
// error is returned.
//
// NOTE(review): CREATE TABLE IF NOT EXISTS leaves an existing jobs table
// untouched, so a database file created with an older column set is not
// upgraded here — confirm whether a migration step is needed.
func Init(dbPath string) (*sql.DB, error) {
	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return nil, err
	}

	const schema = `
	CREATE TABLE IF NOT EXISTS jobs (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		status TEXT NOT NULL DEFAULT 'pending',
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		webhook_url TEXT,
		url TEXT,
		selector TEXT,
		extracted_content TEXT,
		raw_html TEXT
	);`
	if _, err := db.Exec(schema); err != nil {
		// The caller never receives db on this path, so release it here;
		// the schema error is the one worth reporting, not the Close result.
		_ = db.Close()
		return nil, err
	}
	return db, nil
}
// CreateJob inserts a new row into jobs with status 'pending' and the given
// webhook URL, target URL and selector. It returns the id reported by the
// driver for the inserted row, or 0 and the error if the insert fails.
func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) {
	const insertJob = "INSERT INTO jobs (status, webhook_url, url, selector) VALUES ('pending', ?, ?, ?)"

	res, execErr := db.Exec(insertJob, webhookURL, url, selector)
	if execErr != nil {
		return 0, execErr
	}
	return res.LastInsertId()
}
// GetPendingJobs returns every row of jobs whose status is 'pending'.
//
// The nullable text columns (webhook_url, url, selector, extracted_content,
// raw_html) are scanned through sql.NullString, so a NULL in the database
// becomes an empty string in the returned Job. When no row matches, the
// returned slice is nil and the error is nil.
//
// Any failure — running the query, scanning a row, or iterating the result
// set — yields a nil slice and the error; a partial list is never returned.
func GetPendingJobs(db *sql.DB) ([]Job, error) {
	rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'")
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var jobs []Job
	for rows.Next() {
		var job Job
		var webhookURL, url, selector, extractedContent, rawHTML sql.NullString
		if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil {
			return nil, err
		}
		job.WebhookURL = webhookURL.String
		job.URL = url.String
		job.Selector = selector.String
		job.ExtractedContent = extractedContent.String
		job.RawHTML = rawHTML.String
		jobs = append(jobs, job)
	}
	// rows.Next also returns false when iteration fails part-way through;
	// without this check a truncated result would look like a complete one.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return jobs, nil
}
// UpdateJobContent writes content into the extracted_content column of the
// job row with the given id. The statement result is discarded, so an id that
// matches no row is not reported as an error.
func UpdateJobContent(db *sql.DB, id int, content string) error {
	const setContent = "UPDATE jobs SET extracted_content = ? WHERE id = ?"

	if _, err := db.Exec(setContent, content, id); err != nil {
		return err
	}
	return nil
}
// UpdateJobHTML writes html into the raw_html column of the job row with the
// given id. The statement result is discarded, so an id that matches no row
// is not reported as an error.
func UpdateJobHTML(db *sql.DB, id int, html string) error {
	const setRawHTML = "UPDATE jobs SET raw_html = ? WHERE id = ?"

	_, execErr := db.Exec(setRawHTML, html, id)
	return execErr
}
// MarkJobDone sets the status column of the job row with the given id to
// 'done'. The statement result is discarded, so an id that matches no row is
// not reported as an error.
func MarkJobDone(db *sql.DB, id int) error {
	if _, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id); err != nil {
		return err
	}
	return nil
}