Add URL scraping with ethical web crawling

Replace the placeholder sleep with actual URL fetching. The worker scrapes the HTML title from each URL, respects robots.txt, and sends proper User-Agent headers. Scraped titles are stored in SQLite and delivered via the webhook callback.
This commit is contained in:
Soldier 2025-11-16 08:18:31 +00:00
parent 018d699e31
commit 1ce45cfe97
5 changed files with 125 additions and 10 deletions

View File

@ -31,12 +31,13 @@ func main() {
var req struct { var req struct {
WebhookURL string `json:"webhook_url"` WebhookURL string `json:"webhook_url"`
URL string `json:"url"`
} }
body, _ := io.ReadAll(r.Body) body, _ := io.ReadAll(r.Body)
json.Unmarshal(body, &req) json.Unmarshal(body, &req)
id, err := db.CreateJob(database, req.WebhookURL) id, err := db.CreateJob(database, req.WebhookURL, req.URL)
if err != nil { if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
return return

6
go.mod
View File

@ -2,4 +2,8 @@ module alpenqueue
go 1.25.4 go 1.25.4
require github.com/mattn/go-sqlite3 v1.14.32 // indirect require (
github.com/mattn/go-sqlite3 v1.14.32 // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.47.0 // indirect
)

8
go.sum
View File

@ -1,2 +1,10 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=

View File

@ -12,6 +12,8 @@ type Job struct {
Status string Status string
CreatedAt time.Time CreatedAt time.Time
WebhookURL string WebhookURL string
URL string
Title string
} }
func Init(dbPath string) (*sql.DB, error) { func Init(dbPath string) (*sql.DB, error) {
@ -25,7 +27,9 @@ func Init(dbPath string) (*sql.DB, error) {
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
status TEXT NOT NULL DEFAULT 'pending', status TEXT NOT NULL DEFAULT 'pending',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP, created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
webhook_url TEXT webhook_url TEXT,
url TEXT,
title TEXT
);` );`
_, err = db.Exec(schema) _, err = db.Exec(schema)
@ -36,8 +40,8 @@ func Init(dbPath string) (*sql.DB, error) {
return db, nil return db, nil
} }
func CreateJob(db *sql.DB, webhookURL string) (int64, error) { func CreateJob(db *sql.DB, webhookURL, url string) (int64, error) {
result, err := db.Exec("INSERT INTO jobs (status, webhook_url) VALUES ('pending', ?)", webhookURL) result, err := db.Exec("INSERT INTO jobs (status, webhook_url, url) VALUES ('pending', ?, ?)", webhookURL, url)
if err != nil { if err != nil {
return 0, err return 0, err
} }
@ -45,7 +49,7 @@ func CreateJob(db *sql.DB, webhookURL string) (int64, error) {
} }
func GetPendingJobs(db *sql.DB) ([]Job, error) { func GetPendingJobs(db *sql.DB) ([]Job, error) {
rows, err := db.Query("SELECT id, status, created_at, webhook_url FROM jobs WHERE status = 'pending'") rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, title FROM jobs WHERE status = 'pending'")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -54,14 +58,23 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) {
var jobs []Job var jobs []Job
for rows.Next() { for rows.Next() {
var job Job var job Job
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &job.WebhookURL); err != nil { var webhookURL, url, title sql.NullString
if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &title); err != nil {
return nil, err return nil, err
} }
job.WebhookURL = webhookURL.String
job.URL = url.String
job.Title = title.String
jobs = append(jobs, job) jobs = append(jobs, job)
} }
return jobs, nil return jobs, nil
} }
// UpdateJobTitle persists the scraped page title for the job identified by id.
// It returns any error reported by the database.
func UpdateJobTitle(db *sql.DB, id int, title string) error {
	const query = "UPDATE jobs SET title = ? WHERE id = ?"
	_, err := db.Exec(query, title, id)
	return err
}
func MarkJobDone(db *sql.DB, id int) error { func MarkJobDone(db *sql.DB, id int) error {
_, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id) _, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id)
return err return err

View File

@ -6,11 +6,68 @@ import (
"database/sql" "database/sql"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io"
"log" "log"
"net/http" "net/http"
"net/url"
"strings"
"time" "time"
"github.com/temoto/robotstxt"
"golang.org/x/net/html"
) )
const userAgent = "AlpenQueue/1.0 (+https://github.com/yourusername/alpenqueue)"
// extractTitle scans an HTML document and returns the text content of the
// first <title> element, trimmed of surrounding whitespace. It returns ""
// when the document has no title, the title is empty, or the input cannot
// be tokenized.
func extractTitle(body io.Reader) string {
	tokenizer := html.NewTokenizer(body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// io.EOF or malformed input: no title found.
			return ""
		case html.StartTagToken:
			if tokenizer.Token().Data != "title" {
				continue
			}
			// The next token should be the title's text. For an empty
			// <title></title> it is the end tag instead — the previous
			// code returned the literal string "title" in that case, so
			// guard on the token type.
			if tokenizer.Next() == html.TextToken {
				return strings.TrimSpace(tokenizer.Token().Data)
			}
			return ""
		}
	}
}
// checkRobotsTxt reports whether this crawler may fetch targetURL according
// to the site's robots.txt. Transient failures (network errors, unreadable
// or unparseable robots.txt) are treated permissively so they do not block
// jobs; an unparseable target URL is treated as disallowed.
func checkRobotsTxt(targetURL string) bool {
	parsedURL, err := url.Parse(targetURL)
	if err != nil {
		return false
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		// Malformed robots URL (e.g. bad scheme/host): refuse to crawl.
		return false
	}
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err != nil {
		// Network error fetching robots.txt: default to allow.
		return true
	}
	defer resp.Body.Close()
	// Cap the read so a hostile or misconfigured server cannot exhaust
	// memory; robots.txt files are conventionally small.
	robotsData, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024))
	if err != nil {
		return true
	}
	// FromStatusAndBytes applies the conventional status-code semantics
	// (4xx -> allow all, 5xx -> disallow all) instead of only
	// special-casing 404 and parsing error pages as rules.
	robots, err := robotstxt.FromStatusAndBytes(resp.StatusCode, robotsData)
	if err != nil {
		return true
	}
	// An empty path (e.g. "http://example.com") must be tested as "/".
	path := parsedURL.Path
	if path == "" {
		path = "/"
	}
	return robots.TestAgent(path, userAgent)
}
func Start(database *sql.DB) { func Start(database *sql.DB) {
go func() { go func() {
for { for {
@ -22,17 +79,49 @@ func Start(database *sql.DB) {
} }
for _, job := range jobs { for _, job := range jobs {
log.Printf("Processing job %d", job.ID) log.Printf("Processing job %d: %s", job.ID, job.URL)
start := time.Now() start := time.Now()
time.Sleep(2 * time.Second) title := ""
status := "ok"
if job.URL != "" {
if !checkRobotsTxt(job.URL) {
log.Printf("Job %d: Blocked by robots.txt", job.ID)
status = "blocked"
} else {
client := &http.Client{Timeout: 30 * time.Second}
req, err := http.NewRequest("GET", job.URL, nil)
if err != nil {
log.Printf("Job %d: Error creating request: %v", job.ID, err)
status = "error"
} else {
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
log.Printf("Job %d: Error fetching URL: %v", job.ID, err)
status = "error"
} else {
defer resp.Body.Close()
title = extractTitle(resp.Body)
log.Printf("Job %d: Extracted title: %s", job.ID, title)
if err := db.UpdateJobTitle(database, job.ID, title); err != nil {
log.Printf("Job %d: Error updating title: %v", job.ID, err)
}
}
}
}
}
duration := time.Since(start) duration := time.Since(start)
if job.WebhookURL != "" { if job.WebhookURL != "" {
payload := map[string]string{ payload := map[string]string{
"status": "ok", "status": status,
"took": fmt.Sprintf("%.1fs", duration.Seconds()), "took": fmt.Sprintf("%.1fs", duration.Seconds()),
"title": title,
} }
jsonData, _ := json.Marshal(payload) jsonData, _ := json.Marshal(payload)