Add raw HTML archiving for historical re-parsing
Store the complete HTML response in the raw_html column before extraction runs. This enables re-running selectors against historical scrapes when sites change their DOM structure or CSS classes.
This commit is contained in:
parent
405f9ca173
commit
985d340855
16
pkg/db/db.go
16
pkg/db/db.go
@ -15,6 +15,7 @@ type Job struct {
|
|||||||
URL string
|
URL string
|
||||||
Selector string
|
Selector string
|
||||||
ExtractedContent string
|
ExtractedContent string
|
||||||
|
RawHTML string
|
||||||
}
|
}
|
||||||
|
|
||||||
func Init(dbPath string) (*sql.DB, error) {
|
func Init(dbPath string) (*sql.DB, error) {
|
||||||
@ -31,7 +32,8 @@ func Init(dbPath string) (*sql.DB, error) {
|
|||||||
webhook_url TEXT,
|
webhook_url TEXT,
|
||||||
url TEXT,
|
url TEXT,
|
||||||
selector TEXT,
|
selector TEXT,
|
||||||
extracted_content TEXT
|
extracted_content TEXT,
|
||||||
|
raw_html TEXT
|
||||||
);`
|
);`
|
||||||
|
|
||||||
_, err = db.Exec(schema)
|
_, err = db.Exec(schema)
|
||||||
@ -51,7 +53,7 @@ func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
||||||
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content FROM jobs WHERE status = 'pending'")
|
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -60,14 +62,15 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
|||||||
var jobs []Job
|
var jobs []Job
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var job Job
|
var job Job
|
||||||
var webhookURL, url, selector, extractedContent sql.NullString
|
var webhookURL, url, selector, extractedContent, rawHTML sql.NullString
|
||||||
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent); err != nil {
|
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
job.WebhookURL = webhookURL.String
|
job.WebhookURL = webhookURL.String
|
||||||
job.URL = url.String
|
job.URL = url.String
|
||||||
job.Selector = selector.String
|
job.Selector = selector.String
|
||||||
job.ExtractedContent = extractedContent.String
|
job.ExtractedContent = extractedContent.String
|
||||||
|
job.RawHTML = rawHTML.String
|
||||||
jobs = append(jobs, job)
|
jobs = append(jobs, job)
|
||||||
}
|
}
|
||||||
return jobs, nil
|
return jobs, nil
|
||||||
@ -78,6 +81,11 @@ func UpdateJobContent(db *sql.DB, id int, content string) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UpdateJobHTML persists the raw HTML payload for the job identified by id,
// overwriting any value previously stored in the raw_html column.
func UpdateJobHTML(db *sql.DB, id int, html string) error {
	const stmt = "UPDATE jobs SET raw_html = ? WHERE id = ?"
	_, err := db.Exec(stmt, html, id)
	return err
}
|
||||||
|
|
||||||
func MarkJobDone(db *sql.DB, id int) error {
|
func MarkJobDone(db *sql.DB, id int) error {
|
||||||
_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
|
_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
|
||||||
return err
|
return err
|
||||||
|
|||||||
@ -118,11 +118,25 @@ func Start(database *sql.DB) {
|
|||||||
status = "error"
|
status = "error"
|
||||||
} else {
|
} else {
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
content = extractContent(resp.Body, job.Selector)
|
|
||||||
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
|
|
||||||
|
|
||||||
if err := db.UpdateJobContent(database, job.ID, content); err != nil {
|
htmlBytes, err := io.ReadAll(resp.Body)
|
||||||
log.Printf("Job %d: Error updating content: %v", job.ID, err)
|
if err != nil {
|
||||||
|
log.Printf("Job %d: Error reading response: %v", job.ID, err)
|
||||||
|
status = "error"
|
||||||
|
} else {
|
||||||
|
rawHTML := string(htmlBytes)
|
||||||
|
log.Printf("Job %d: Saved HTML (%d bytes)", job.ID, len(rawHTML))
|
||||||
|
|
||||||
|
if err := db.UpdateJobHTML(database, job.ID, rawHTML); err != nil {
|
||||||
|
log.Printf("Job %d: Error updating HTML: %v", job.ID, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
content = extractContent(bytes.NewReader(htmlBytes), job.Selector)
|
||||||
|
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
|
||||||
|
|
||||||
|
if err := db.UpdateJobContent(database, job.ID, content); err != nil {
|
||||||
|
log.Printf("Job %d: Error updating content: %v", job.ID, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user