AlpenQueue/pkg/worker/worker.go
Soldier 1ce45cfe97 Add URL scraping with ethical web crawling
Replace sleep with actual URL fetching. The worker scrapes the HTML title from each URL, respects robots.txt, and sends a proper User-Agent header. Scraped titles are stored in SQLite and sent via webhook callback.
2025-11-16 08:18:31 +00:00

148 lines
3.3 KiB
Go

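// Package worker implements AlpenQueue's background job processor. It polls
// SQLite for pending jobs, scrapes the HTML <title> of each job's URL while
// honoring robots.txt and sending a descriptive User-Agent, stores the
// result, and posts an optional webhook callback.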
package worker

import (
	"bytes"
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"time"

	"alpenqueue/pkg/db"
	"github.com/temoto/robotstxt"
	"golang.org/x/net/html"
)

const userAgent = "AlpenQueue/1.0 (+https://github.com/yourusername/alpenqueue)"
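
// extractTitle streams tokens from an HTML document and returns the text of
// the first <title> element, or "" if the document ends without one.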
func extractTitle(body io.Reader) string {
	tokenizer := html.NewTokenizer(body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			// io.EOF or malformed input: no title found.
			return ""
		}
		token := tokenizer.Token()
		if tokenType == html.StartTagToken && token.Data == "title" {
			// The next token should be the title text; an empty <title></title>
			// yields its closing tag instead, so require a text token.
			if tokenizer.Next() == html.TextToken {
				return strings.TrimSpace(tokenizer.Token().Data)
			}
			return ""
		}
	}
}
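
// checkRobotsTxt reports whether targetURL may be fetched under the host's
// robots.txt rules. Failures on the robots side fail open (the fetch is
// allowed); an unparseable target URL is refused.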
func checkRobotsTxt(targetURL string) bool {
	parsedURL, err := url.Parse(targetURL)
	if err != nil {
		return false
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
	client := &http.Client{Timeout: 10 * time.Second}
	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		return true
	}
	req.Header.Set("User-Agent", userAgent)
	resp, err := client.Do(req)
	if err != nil {
		// Robots.txt unreachable: fail open and allow the fetch.
		return true
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotFound {
		// No robots.txt means no restrictions.
		return true
	}
	robotsData, err := io.ReadAll(resp.Body)
	if err != nil {
		return true
	}
	robots, err := robotstxt.FromBytes(robotsData)
	if err != nil {
		return true
	}
	return robots.TestAgent(parsedURL.Path, userAgent)
}
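
// Start launches the worker loop in a background goroutine. Every five
// seconds it drains the pending-job queue: each job's page title is scraped,
// stored via db.UpdateJobTitle, optionally reported to the job's webhook with
// a status of "ok", "blocked", or "error", and the job is marked done.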
func Start(database *sql.DB) {
	go func() {
		for {
			jobs, err := db.GetPendingJobs(database)
			if err != nil {
				log.Printf("Error fetching jobs: %v", err)
				time.Sleep(5 * time.Second)
				continue
			}
			for _, job := range jobs {
				log.Printf("Processing job %d: %s", job.ID, job.URL)
				start := time.Now()
				title := ""
				status := "ok"
				if job.URL != "" {
					if !checkRobotsTxt(job.URL) {
						log.Printf("Job %d: Blocked by robots.txt", job.ID)
						status = "blocked"
					} else {
						client := &http.Client{Timeout: 30 * time.Second}
						req, err := http.NewRequest("GET", job.URL, nil)
						if err != nil {
							log.Printf("Job %d: Error creating request: %v", job.ID, err)
							status = "error"
						} else {
							req.Header.Set("User-Agent", userAgent)
							resp, err := client.Do(req)
							if err != nil {
								log.Printf("Job %d: Error fetching URL: %v", job.ID, err)
								status = "error"
							} else {
								title = extractTitle(resp.Body)
								// Close now instead of defer: this goroutine never
								// returns, so deferred closes would accumulate and
								// leak connections.
								resp.Body.Close()
								log.Printf("Job %d: Extracted title: %s", job.ID, title)
								if err := db.UpdateJobTitle(database, job.ID, title); err != nil {
									log.Printf("Job %d: Error updating title: %v", job.ID, err)
								}
							}
						}
					}
				}
				duration := time.Since(start)
				if job.WebhookURL != "" {
					// Callback payload, e.g. {"status":"ok","took":"1.2s","title":"Example Domain"}.
					payload := map[string]string{
						"status": status,
						"took":   fmt.Sprintf("%.1fs", duration.Seconds()),
						"title":  title,
					}
					jsonData, _ := json.Marshal(payload) // a map[string]string always marshals
					// Use a client with a timeout; http.Post would use the default
					// client and could block this loop indefinitely.
					webhookClient := &http.Client{Timeout: 10 * time.Second}
					resp, err := webhookClient.Post(job.WebhookURL, "application/json", bytes.NewBuffer(jsonData))
					if err != nil {
						log.Printf("Error posting webhook for job %d: %v", job.ID, err)
					} else {
						resp.Body.Close()
						log.Printf("Webhook posted for job %d", job.ID)
					}
				}
				if err := db.MarkJobDone(database, job.ID); err != nil {
					log.Printf("Error marking job %d done: %v", job.ID, err)
				} else {
					log.Printf("Job %d completed", job.ID)
				}
			}
			time.Sleep(5 * time.Second)
		}
	}()
}