AlpenQueue/pkg/db/db.go
Soldier · 1ce45cfe97 · 2025-11-16 08:18:31 +00:00
Add URL scraping with ethical web crawling

Replace the sleep stub with actual URL fetching. The worker scrapes the HTML title from each URL, respects robots.txt, and sends a proper User-Agent header. Scraped titles are stored in SQLite and sent via webhook callback.


package db

import (
	"database/sql"
	"time"

	_ "github.com/mattn/go-sqlite3" // SQLite driver, registered under the name "sqlite3"
)

// Job is one row of the jobs table: a URL to scrape, the webhook to
// notify when scraping finishes, and the title extracted from the page.
type Job struct {
	ID         int
	Status     string
	CreatedAt  time.Time
	WebhookURL string
	URL        string
	Title      string
}

// Init opens (or creates) the SQLite database at dbPath and ensures the
// jobs table exists.
func Init(dbPath string) (*sql.DB, error) {
	db, err := sql.Open("sqlite3", dbPath)
	if err != nil {
		return nil, err
	}
	schema := `
	CREATE TABLE IF NOT EXISTS jobs (
		id INTEGER PRIMARY KEY AUTOINCREMENT,
		status TEXT NOT NULL DEFAULT 'pending',
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		webhook_url TEXT,
		url TEXT,
		title TEXT
	);`
	if _, err := db.Exec(schema); err != nil {
		// Don't leak the handle if the schema can't be created.
		db.Close()
		return nil, err
	}
	return db, nil
}

// CreateJob inserts a new pending job and returns its ID.
func CreateJob(db *sql.DB, webhookURL, url string) (int64, error) {
	result, err := db.Exec("INSERT INTO jobs (status, webhook_url, url) VALUES ('pending', ?, ?)", webhookURL, url)
	if err != nil {
		return 0, err
	}
	return result.LastInsertId()
}
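
// Usage sketch (illustrative only; w, r, and conn are assumed to come from
// an enqueue HTTP handler elsewhere, not from this package):
//
//	id, err := CreateJob(conn, r.FormValue("webhook_url"), r.FormValue("url"))
//	if err != nil {
//		http.Error(w, "enqueue failed", http.StatusInternalServerError)
//		return
//	}
//	fmt.Fprintf(w, "queued job %d\n", id)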

// GetPendingJobs returns every job still in the 'pending' state. The
// NULLable columns are scanned through sql.NullString so missing values
// come back as empty strings instead of scan errors.
func GetPendingJobs(db *sql.DB) ([]Job, error) {
	rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, title FROM jobs WHERE status = 'pending'")
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var jobs []Job
	for rows.Next() {
		var job Job
		var webhookURL, url, title sql.NullString
		if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &title); err != nil {
			return nil, err
		}
		job.WebhookURL = webhookURL.String
		job.URL = url.String
		job.Title = title.String
		jobs = append(jobs, job)
	}
	// Surface any error hit during iteration, not just during Scan.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return jobs, nil
}
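
// Polling sketch (illustrative): a worker loop claiming pending jobs. The
// conn variable, the interval, and the process helper are assumptions, not
// part of this package.
//
//	for {
//		jobs, err := GetPendingJobs(conn)
//		if err != nil {
//			log.Println("poll:", err)
//		}
//		for _, job := range jobs {
//			process(job) // fetch job.URL, then UpdateJobTitle and MarkJobDone
//		}
//		time.Sleep(5 * time.Second)
//	}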

// UpdateJobTitle stores the scraped page title for a job.
func UpdateJobTitle(db *sql.DB, id int, title string) error {
	_, err := db.Exec("UPDATE jobs SET title = ? WHERE id = ?", title, id)
	return err
}

// MarkJobDone moves a job to the terminal 'done' state.
func MarkJobDone(db *sql.DB, id int) error {
	_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
	return err
}
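
The worker side described in the commit message is not part of this file. A minimal sketch of what it might look like, using only the standard library, is below: the package name, the fetchTitle and process helpers, the User-Agent string, and the import path AlpenQueue/pkg/db are all assumptions for illustration. The regexp-based title extraction stands in for a real HTML parser (golang.org/x/net/html), and a production crawler would also consult robots.txt (e.g. via a parser such as github.com/temoto/robotstxt) before fetching.

package worker // hypothetical package; the real worker lives elsewhere in AlpenQueue

import (
	"database/sql"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"time"

	"AlpenQueue/pkg/db" // assumed import path, based on this file's location
)

// titleRe naively captures the contents of the first <title> element.
var titleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

// fetchTitle downloads a page with an identifying User-Agent and returns
// its <title> text. A robots.txt check would go before the request here.
func fetchTitle(url string) (string, error) {
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	// "Proper User-Agent headers" per the commit message; the exact
	// string is an assumption.
	req.Header.Set("User-Agent", "AlpenQueueBot/1.0")
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetch %s: status %d", url, resp.StatusCode)
	}
	body, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) // read at most 1 MiB
	if err != nil {
		return "", err
	}
	m := titleRe.FindSubmatch(body)
	if m == nil {
		return "", fmt.Errorf("fetch %s: no <title> found", url)
	}
	return string(m[1]), nil
}

// process runs one job end to end: fetch the title, store it, mark the
// job done. The webhook callback is omitted for brevity.
func process(conn *sql.DB, job db.Job) error {
	title, err := fetchTitle(job.URL)
	if err != nil {
		return err
	}
	if err := db.UpdateJobTitle(conn, job.ID, title); err != nil {
		return err
	}
	return db.MarkJobDone(conn, job.ID)
}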