Store the complete HTML response in the raw_html column before extraction. This enables re-running selectors on historical scrapes when sites change their DOM structure or CSS classes.
93 lines
2.2 KiB
Go
93 lines
2.2 KiB
Go
package db
|
|
|
|
import (
|
|
"database/sql"
|
|
"time"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// Job mirrors one row of the jobs table. Field order follows the column
// order of the table definition.
type Job struct {
	// ID holds the id column, the table's auto-incrementing primary key.
	ID int

	// Status holds the status column. Rows are inserted as 'pending' and
	// later switched to 'done'.
	Status string

	// CreatedAt holds the created_at column, which the schema defaults to
	// CURRENT_TIMESTAMP.
	CreatedAt time.Time

	// WebhookURL holds the webhook_url column.
	// NOTE(review): presumably the endpoint notified about the job; confirm
	// against the caller.
	WebhookURL string

	// URL holds the url column.
	// NOTE(review): presumably the page to scrape; confirm against the caller.
	URL string

	// Selector holds the selector column.
	// NOTE(review): presumably the selector run against the fetched page;
	// confirm against the caller.
	Selector string

	// ExtractedContent holds the extracted_content column.
	ExtractedContent string

	// RawHTML holds the raw_html column.
	// NOTE(review): presumably the unprocessed HTML response of the scrape;
	// confirm against the caller.
	RawHTML string
}
|
|
|
|
// Init obtains a database handle for dbPath through the "sqlite3" driver and
// executes the CREATE TABLE IF NOT EXISTS statement for the jobs table.
//
// On success the caller owns the returned *sql.DB and should close it when
// done. If sql.Open fails, or the schema statement fails, a nil handle and the
// error are returned; in the latter case the handle is closed first so its
// pooled connection is not leaked.
//
// NOTE(review): CREATE TABLE IF NOT EXISTS leaves an already existing jobs
// table untouched, so a database file created before the raw_html column was
// introduced will not gain that column here. Confirm whether a migration is
// needed for existing databases.
func Init(dbPath string) (*sql.DB, error) {
	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return nil, err
	}

	const schema = `
	CREATE TABLE IF NOT EXISTS jobs (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		status TEXT NOT NULL DEFAULT 'pending',
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		webhook_url TEXT,
		url TEXT,
		selector TEXT,
		extracted_content TEXT,
		raw_html TEXT
	);`

	if _, err := db.Exec(schema); err != nil {
		// The schema error is the one worth reporting; a failure to close the
		// half-initialised handle would only obscure it.
		_ = db.Close()
		return nil, err
	}

	return db, nil
}
|
|
|
|
// CreateJob inserts a row into jobs with status 'pending' and the given
// webhook_url, url and selector values, passed as bound parameters.
//
// It returns the id the database reports for the inserted row. If the insert
// itself fails, it returns 0 and the error from Exec unchanged.
func CreateJob(db *sql.DB, webhookURL, url, selector string) (int64, error) {
	const insertJob = "INSERT INTO jobs (status, webhook_url, url, selector) VALUES ('pending', ?, ?, ?)"

	res, err := db.Exec(insertJob, webhookURL, url, selector)
	if err != nil {
		return 0, err
	}
	return res.LastInsertId()
}
|
|
|
|
func GetPendingJobs(db *sql.DB) ([]Job, error) {
|
|
rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, selector, extracted_content, raw_html FROM jobs WHERE status = 'pending'")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
var jobs []Job
|
|
for rows.Next() {
|
|
var job Job
|
|
var webhookURL, url, selector, extractedContent, rawHTML sql.NullString
|
|
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &selector, &extractedContent, &rawHTML); err != nil {
|
|
return nil, err
|
|
}
|
|
job.WebhookURL = webhookURL.String
|
|
job.URL = url.String
|
|
job.Selector = selector.String
|
|
job.ExtractedContent = extractedContent.String
|
|
job.RawHTML = rawHTML.String
|
|
jobs = append(jobs, job)
|
|
}
|
|
return jobs, nil
|
|
}
|
|
|
|
// UpdateJobContent sets the extracted_content column to content for the jobs
// row whose id equals id, using bound parameters.
//
// It returns the error from Exec, if any. The Exec result is discarded, so
// the number of affected rows is not checked.
func UpdateJobContent(db *sql.DB, id int, content string) error {
	const updateContent = "UPDATE jobs SET extracted_content = ? WHERE id = ?"

	_, err := db.Exec(updateContent, content, id)
	return err
}
|
|
|
|
// UpdateJobHTML sets the raw_html column to html for the jobs row whose id
// equals id, using bound parameters.
//
// It returns the error from Exec, if any. The Exec result is discarded, so
// the number of affected rows is not checked.
func UpdateJobHTML(db *sql.DB, id int, html string) error {
	const updateHTML = "UPDATE jobs SET raw_html = ? WHERE id = ?"

	_, err := db.Exec(updateHTML, html, id)
	return err
}
|
|
|
|
// MarkJobDone sets the status column to 'done' for the jobs row whose id
// equals id, using a bound parameter for the id.
//
// It returns the error from Exec, if any. The Exec result is discarded, so
// the number of affected rows is not checked.
func MarkJobDone(db *sql.DB, id int) error {
	const markDone = "UPDATE jobs SET status = 'done' WHERE id = ?"

	_, err := db.Exec(markDone, id)
	return err
}
|