Add raw HTML archiving for historical re-parsing
Store complete HTML response in raw_html column before extraction. Enables re-running selectors on historical scrapes when sites change their DOM structure or CSS classes.
This commit is contained in:
parent
405f9ca173
commit
985d340855
16
pkg/db/db.go
16
pkg/db/db.go
@ -15,6 +15,7 @@ type Job struct {
|
||||
URL string
|
||||
Selector string
|
||||
ExtractedContent string
|
||||
RawHTML string
|
||||
}
|
||||
|
||||
func Init(dbPath string) (*sql.DB, error) {
|
||||
@ -31,7 +32,8 @@ func Init(dbPath string) (*sql.DB, error) {
|
||||
webhook_url TEXT,
|
||||
url TEXT,
|
||||
selector TEXT,
|
||||
extracted_content TEXT
|
||||
extracted_content TEXT,
|
||||
raw_html TEXT
|
||||
);`
|
||||
|
||||
_, err = db.Exec(schema)
|
||||
@ -51,7 +53,7 @@ func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) {
|
||||
}
|
||||
|
||||
func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
||||
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content FROM jobs WHERE status = 'pending'")
|
||||
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -60,14 +62,15 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
||||
var jobs []Job
|
||||
for rows.Next() {
|
||||
var job Job
|
||||
var webhookURL, url, selector, extractedContent sql.NullString
|
||||
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent); err != nil {
|
||||
var webhookURL, url, selector, extractedContent, rawHTML sql.NullString
|
||||
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
job.WebhookURL = webhookURL.String
|
||||
job.URL = url.String
|
||||
job.Selector = selector.String
|
||||
job.ExtractedContent = extractedContent.String
|
||||
job.RawHTML = rawHTML.String
|
||||
jobs = append(jobs, job)
|
||||
}
|
||||
return jobs, nil
|
||||
@ -78,6 +81,11 @@ func UpdateJobContent(db *sql.DB, id int, content string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// UpdateJobHTML stores the raw HTML snapshot captured for the job
// identified by id, overwriting any previously saved value in the
// raw_html column. The Exec error, if any, is returned to the caller
// unchanged, matching the other single-statement update helpers in
// this package.
func UpdateJobHTML(db *sql.DB, id int, html string) error {
	if _, err := db.Exec("UPDATE jobs SET raw_html = ? WHERE id = ?", html, id); err != nil {
		return err
	}
	return nil
}
|
||||
|
||||
func MarkJobDone(db *sql.DB, id int) error {
|
||||
_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
|
||||
return err
|
||||
|
||||
@ -118,7 +118,20 @@ func Start(database *sql.DB) {
|
||||
status = "error"
|
||||
} else {
|
||||
defer resp.Body.Close()
|
||||
content = extractContent(resp.Body, job.Selector)
|
||||
|
||||
htmlBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Printf("Job %d: Error reading response: %v", job.ID, err)
|
||||
status = "error"
|
||||
} else {
|
||||
rawHTML := string(htmlBytes)
|
||||
log.Printf("Job %d: Saved HTML (%d bytes)", job.ID, len(rawHTML))
|
||||
|
||||
if err := db.UpdateJobHTML(database, job.ID, rawHTML); err != nil {
|
||||
log.Printf("Job %d: Error updating HTML: %v", job.ID, err)
|
||||
}
|
||||
|
||||
content = extractContent(bytes.NewReader(htmlBytes), job.Selector)
|
||||
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
|
||||
|
||||
if err := db.UpdateJobContent(database, job.ID, content); err != nil {
|
||||
@ -128,6 +141,7 @@ func Start(database *sql.DB) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
duration := time.Since(start)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user