Add raw HTML archiving for historical re-parsing

Store complete HTML response in raw_html column before extraction. Enables re-running selectors on historical scrapes when sites change their DOM structure or CSS classes.
This commit is contained in:
Soldier 2025-11-16 08:43:46 +00:00
parent 405f9ca173
commit 985d340855
2 changed files with 30 additions and 8 deletions

View File

@ -15,6 +15,7 @@ type Job struct {
URL string URL string
Selector string Selector string
ExtractedContent string ExtractedContent string
RawHTML string
} }
func Init(dbPath string) (*sql.DB, error) { func Init(dbPath string) (*sql.DB, error) {
@ -31,7 +32,8 @@ func Init(dbPath string) (*sql.DB, error) {
webhook_url TEXT, webhook_url TEXT,
url TEXT, url TEXT,
selector TEXT, selector TEXT,
extracted_content TEXT extracted_content TEXT,
raw_html TEXT
);` );`
_, err = db.Exec(schema) _, err = db.Exec(schema)
@ -51,7 +53,7 @@ func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) {
} }
func GetPendingJobs(db *sql.DB) ([]Job, error) { func GetPendingJobs(db *sql.DB) ([]Job, error) {
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content FROM jobs WHERE status = 'pending'") rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -60,14 +62,15 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) {
var jobs []Job var jobs []Job
for rows.Next() { for rows.Next() {
var job Job var job Job
var webhookURL, url, selector, extractedContent sql.NullString var webhookURL, url, selector, extractedContent, rawHTML sql.NullString
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent); err != nil { if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil {
return nil, err return nil, err
} }
job.WebhookURL = webhookURL.String job.WebhookURL = webhookURL.String
job.URL = url.String job.URL = url.String
job.Selector = selector.String job.Selector = selector.String
job.ExtractedContent = extractedContent.String job.ExtractedContent = extractedContent.String
job.RawHTML = rawHTML.String
jobs = append(jobs, job) jobs = append(jobs, job)
} }
return jobs, nil return jobs, nil
@ -78,6 +81,11 @@ func UpdateJobContent(db *sql.DB, id int, content string) error {
return err return err
} }
// UpdateJobHTML stores the raw HTML response for the job with the given id
// in the raw_html column, so selectors can be re-run on it later.
func UpdateJobHTML(db *sql.DB, id int, html string) error {
	const query = "UPDATE jobs SET raw_html = ? WHERE id = ?"
	if _, err := db.Exec(query, html, id); err != nil {
		return err
	}
	return nil
}
func MarkJobDone(db *sql.DB, id int) error { func MarkJobDone(db *sql.DB, id int) error {
_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id) _, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
return err return err

View File

@ -118,11 +118,25 @@ func Start(database *sql.DB) {
status = "error" status = "error"
} else { } else {
defer resp.Body.Close() defer resp.Body.Close()
content = extractContent(resp.Body, job.Selector)
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
if err := db.UpdateJobContent(database, job.ID, content); err != nil { htmlBytes, err := io.ReadAll(resp.Body)
log.Printf("Job %d: Error updating content: %v", job.ID, err) if err != nil {
log.Printf("Job %d: Error reading response: %v", job.ID, err)
status = "error"
} else {
rawHTML := string(htmlBytes)
log.Printf("Job %d: Saved HTML (%d bytes)", job.ID, len(rawHTML))
if err := db.UpdateJobHTML(database, job.ID, rawHTML); err != nil {
log.Printf("Job %d: Error updating HTML: %v", job.ID, err)
}
content = extractContent(bytes.NewReader(htmlBytes), job.Selector)
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
if err := db.UpdateJobContent(database, job.ID, content); err != nil {
log.Printf("Job %d: Error updating content: %v", job.ID, err)
}
} }
} }
} }