From 985d3408552e9fa4a2a8b6d27834b0ed303d82c2 Mon Sep 17 00:00:00 2001 From: Soldier Date: Sun, 16 Nov 2025 08:43:46 +0000 Subject: [PATCH] Add raw HTML archiving for historical re-parsing Store complete HTML response in raw_html column before extraction. Enables re-running selectors on historical scrapes when sites change their DOM structure or CSS classes. --- pkg/db/db.go | 16 ++++++++++++---- pkg/worker/worker.go | 22 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pkg/db/db.go b/pkg/db/db.go index 95e564d..4f23254 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -15,6 +15,7 @@ type Job struct { URL string Selector string ExtractedContent string + RawHTML string } func Init(dbPath string) (*sql.DB, error) { @@ -31,7 +32,8 @@ func Init(dbPath string) (*sql.DB, error) { webhook_url TEXT, url TEXT, selector TEXT, - extracted_content TEXT + extracted_content TEXT, + raw_html TEXT );` _, err = db.Exec(schema) @@ -51,7 +53,7 @@ func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) { } func GetPendingJobs(db *sql.DB) ([]Job, error) { - rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content FROM jobs WHERE status = 'pending'") + rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'") if err != nil { return nil, err } @@ -60,14 +62,15 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) { var jobs []Job for rows.Next() { var job Job - var webhookURL, url, selector, extractedContent sql.NullString - if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent); err != nil { + var webhookURL, url, selector, extractedContent, rawHTML sql.NullString + if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil { return nil, err } job.WebhookURL = webhookURL.String job.URL = url.String job.Selector = selector.String job.ExtractedContent = extractedContent.String + job.RawHTML = rawHTML.String jobs = append(jobs, job) } return jobs, nil @@ -78,6 +81,11 @@ func UpdateJobContent(db *sql.DB, id int, content string) error { return err } +func UpdateJobHTML(db *sql.DB, id int, html string) error { + _, err := db.Exec("UPDATE jobs SET raw_html = ? WHERE id = ?", html, id) + return err +} + func MarkJobDone(db *sql.DB, id int) error { _, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id) return err diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go index 1cc38f1..9ee1d74 100644 --- a/pkg/worker/worker.go +++ b/pkg/worker/worker.go @@ -118,11 +118,25 @@ func Start(database *sql.DB) { status = "error" } else { defer resp.Body.Close() - content = extractContent(resp.Body, job.Selector) - log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content)) - if err := db.UpdateJobContent(database, job.ID, content); err != nil { - log.Printf("Job %d: Error updating content: %v", job.ID, err) + htmlBytes, err := io.ReadAll(resp.Body) + if err != nil { + log.Printf("Job %d: Error reading response: %v", job.ID, err) + status = "error" + } else { + rawHTML := string(htmlBytes) + log.Printf("Job %d: Saved HTML (%d bytes)", job.ID, len(rawHTML)) + + if err := db.UpdateJobHTML(database, job.ID, rawHTML); err != nil { + log.Printf("Job %d: Error updating HTML: %v", job.ID, err) + } + + content = extractContent(bytes.NewReader(htmlBytes), job.Selector) + log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content)) + + if err := db.UpdateJobContent(database, job.ID, content); err != nil { + log.Printf("Job %d: Error updating content: %v", job.ID, err) + } } } }