diff --git a/cmd/alpenqueue/main.go b/cmd/alpenqueue/main.go index 322be7c..4f96a9d 100644 --- a/cmd/alpenqueue/main.go +++ b/cmd/alpenqueue/main.go @@ -31,12 +31,13 @@ func main() { var req struct { WebhookURL string `json:"webhook_url"` + URL string `json:"url"` } body, _ := io.ReadAll(r.Body) json.Unmarshal(body, &req) - id, err := db.CreateJob(database, req.WebhookURL) + id, err := db.CreateJob(database, req.WebhookURL, req.URL) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return diff --git a/go.mod b/go.mod index 10bcc37..9e1624b 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,8 @@ module alpenqueue go 1.25.4 -require github.com/mattn/go-sqlite3 v1.14.32 // indirect +require ( + github.com/mattn/go-sqlite3 v1.14.32 // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.47.0 // indirect +) diff --git a/go.sum b/go.sum index 66f7516..ab72e85 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,10 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= diff --git a/pkg/db/db.go b/pkg/db/db.go index 5406b2d..c2f3a88 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -12,6 +12,8 @@ type Job struct { Status string CreatedAt time.Time WebhookURL string + URL string + Title string } func Init(dbPath string) (*sql.DB, error) { @@ -25,7 +27,9 @@ func Init(dbPath string) (*sql.DB, error) { id INTEGER PRIMARY KEY AUTOINCREMENT, status TEXT NOT NULL DEFAULT 'pending', created_at DATETIME DEFAULT CURRENT_TIMESTAMP, - webhook_url TEXT + webhook_url TEXT, + url TEXT, + title TEXT );` _, err = db.Exec(schema) @@ -36,8 +40,8 @@ func Init(dbPath string) (*sql.DB, error) { return db, nil } -func CreateJob(db *sql.DB, webhookURL string) (int64, error) { - result, err := db.Exec("INSERT INTO jobs (status, webhook_url) VALUES ('pending', ?)", webhookURL) +func CreateJob(db *sql.DB, webhookURL, url string) (int64, error) { + result, err := db.Exec("INSERT INTO jobs (status, webhook_url, url) VALUES ('pending', ?, ?)", webhookURL, url) if err != nil { return 0, err } @@ -45,7 +49,7 @@ func CreateJob(db *sql.DB, webhookURL string) (int64, error) { } func GetPendingJobs(db *sql.DB) ([]Job, error) { - rows, err := db.Query("SELECT id, status, created_at, webhook_url FROM jobs WHERE status = 'pending'") + rows, err := db.Query("SELECT id, status, created_at, webhook_url, url, title FROM jobs WHERE status = 'pending'") if err != nil { return nil, err } @@ -54,14 +58,23 @@ func GetPendingJobs(db *sql.DB) ([]Job, error) { var jobs []Job for rows.Next() { var job Job - if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &job.WebhookURL); err != nil { + var webhookURL, url, title sql.NullString + if err := rows.Scan(&job.ID, &job.Status, &job.CreatedAt, &webhookURL, &url, &title); err != nil { return nil, err } + job.WebhookURL = webhookURL.String + job.URL = url.String + job.Title = title.String jobs = append(jobs, job) } return jobs, nil } +func UpdateJobTitle(db *sql.DB, id int, title string) error { + _, err := db.Exec("UPDATE jobs SET title = ? WHERE id = ?", title, id) + return err +} + func MarkJobDone(db *sql.DB, id int) error { _, err := db.Exec("UPDATE jobs SET status = 'done' WHERE id = ?", id) return err diff --git a/pkg/worker/worker.go b/pkg/worker/worker.go index 74ac649..d1a73a2 100644 --- a/pkg/worker/worker.go +++ b/pkg/worker/worker.go @@ -6,11 +6,68 @@ import ( "database/sql" "encoding/json" "fmt" + "io" "log" "net/http" + "net/url" + "strings" "time" + + "github.com/temoto/robotstxt" + "golang.org/x/net/html" ) +const userAgent = "AlpenQueue/1.0 (+https://github.com/yourusername/alpenqueue)" + +func extractTitle(body io.Reader) string { + tokenizer := html.NewTokenizer(body) + for { + tokenType := tokenizer.Next() + if tokenType == html.ErrorToken { + return "" + } + token := tokenizer.Token() + if tokenType == html.StartTagToken && token.Data == "title" { + tokenizer.Next() + return strings.TrimSpace(tokenizer.Token().Data) + } + } +} + +func checkRobotsTxt(targetURL string) bool { + parsedURL, err := url.Parse(targetURL) + if err != nil { + return false + } + + robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host) + client := &http.Client{Timeout: 10 * time.Second} + req, _ := http.NewRequest("GET", robotsURL, nil) + req.Header.Set("User-Agent", userAgent) + + resp, err := client.Do(req) + if err != nil { + return true + } + defer resp.Body.Close() + + if resp.StatusCode == 404 { + return true + } + + robotsData, err := io.ReadAll(resp.Body) + if err != nil { + return true + } + + robots, err := robotstxt.FromBytes(robotsData) + if err != nil { + return true + } + + return robots.TestAgent(parsedURL.Path, userAgent) +} + func Start(database *sql.DB) { go func() { for { @@ -22,17 +79,49 @@ func Start(database *sql.DB) { } for _, job := range jobs { - log.Printf("Processing job %d", job.ID) + log.Printf("Processing job %d: %s", job.ID, job.URL) start := time.Now() - time.Sleep(2 * time.Second) + title := "" + status := "ok" + + if job.URL != "" { + if !checkRobotsTxt(job.URL) { + log.Printf("Job %d: Blocked by robots.txt", job.ID) + status = "blocked" + } else { + client := &http.Client{Timeout: 30 * time.Second} + req, err := http.NewRequest("GET", job.URL, nil) + if err != nil { + log.Printf("Job %d: Error creating request: %v", job.ID, err) + status = "error" + } else { + req.Header.Set("User-Agent", userAgent) + + resp, err := client.Do(req) + if err != nil { + log.Printf("Job %d: Error fetching URL: %v", job.ID, err) + status = "error" + } else { + defer resp.Body.Close() + title = extractTitle(resp.Body) + log.Printf("Job %d: Extracted title: %s", job.ID, title) + + if err := db.UpdateJobTitle(database, job.ID, title); err != nil { + log.Printf("Job %d: Error updating title: %v", job.ID, err) + } + } + } + } + } duration := time.Since(start) if job.WebhookURL != "" { payload := map[string]string{ - "status": "ok", + "status": status, "took": fmt.Sprintf("%.1fs", duration.Seconds()), + "title": title, } jsonData, _ := json.Marshal(payload)