// Package worker polls the database for pending jobs, fetches each job's URL
// (after consulting the site's robots.txt), extracts content with a CSS
// selector, and optionally reports the outcome to a webhook.
package worker

import (
	"bytes"
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"time"

	"alpenqueue/pkg/db"
	"github.com/PuerkitoBio/goquery"
	"github.com/temoto/robotstxt"
)

// userAgent is sent on robots.txt and page requests and is also the agent
// string tested against robots.txt rules.
const userAgent = "AlpenQueue/1.0 (+https://github.com/yourusername/alpenqueue)"

// extractContent parses body as HTML and returns, newline-separated, what it
// finds for every element matching selector: the element's trimmed text (when
// non-empty), then its "src" attribute, then its "href" attribute (when
// present). An empty selector defaults to "title". If the document cannot be
// parsed, the empty string is returned.
func extractContent(body io.Reader, selector string) string {
	if selector == "" {
		selector = "title"
	}

	doc, err := goquery.NewDocumentFromReader(body)
	if err != nil {
		return ""
	}

	var results []string
	doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
		if text := strings.TrimSpace(s.Text()); text != "" {
			results = append(results, text)
		}
		if src, ok := s.Attr("src"); ok {
			results = append(results, src)
		}
		if href, ok := s.Attr("href"); ok {
			results = append(results, href)
		}
	})
	return strings.Join(results, "\n")
}

// checkRobotsTxt reports whether targetURL may be fetched according to the
// robots.txt served by the target's host.
//
// It returns false only when targetURL itself cannot be parsed or when
// robots.txt disallows the path for userAgent. Every failure to obtain or
// parse robots.txt (request cannot be built, network error, 404, unreadable
// or malformed body) is treated as "allowed".
func checkRobotsTxt(targetURL string) bool {
	parsedURL, err := url.Parse(targetURL)
	if err != nil {
		return false
	}

	robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		// A target without a scheme (e.g. "example.com") parses fine but
		// yields ":///robots.txt", which NewRequest rejects. Previously the
		// nil request was dereferenced and panicked; allow instead so the
		// actual fetch reports the bad URL as an error.
		return true
	}
	req.Header.Set("User-Agent", userAgent)

	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return true
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return true
	}

	robotsData, err := io.ReadAll(resp.Body)
	if err != nil {
		return true
	}

	robots, err := robotstxt.FromBytes(robotsData)
	if err != nil {
		return true
	}

	// An empty URL path is requested as "/" over HTTP, so test that path.
	path := parsedURL.Path
	if path == "" {
		path = "/"
	}
	return robots.TestAgent(path, userAgent)
}

// fetchContent performs req with client and returns the content extracted
// from the response body using selector. The response body is closed before
// returning, so calling this from a long-running loop does not leak
// connections. The HTTP status code is not inspected: any response that is
// received is parsed.
func fetchContent(client *http.Client, req *http.Request, selector string) (string, error) {
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	return extractContent(resp.Body, selector), nil
}

// postWebhook JSON-encodes payload and POSTs it to webhookURL using client.
// It returns an error if the payload cannot be encoded or the request fails;
// the response status and body are ignored.
func postWebhook(client *http.Client, webhookURL string, payload map[string]string) error {
	body, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("encoding payload: %w", err)
	}

	resp, err := client.Post(webhookURL, "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	// The response is not used; a Close error here is not actionable.
	_ = resp.Body.Close()
	return nil
}

// Start launches a background goroutine that processes pending jobs from
// database in an endless loop, sleeping 5 seconds between polls and after a
// failed poll. The goroutine has no stop mechanism and runs until the process
// exits.
//
// For each job it:
//   - fetches job.URL (when non-empty and permitted by robots.txt), extracts
//     content with job.Selector and stores it via db.UpdateJobContent;
//   - POSTs a JSON summary ("status", "took", "url", "content") to
//     job.WebhookURL when one is set, where status is "ok", "blocked" or
//     "error";
//   - marks the job done via db.MarkJobDone regardless of the outcome.
func Start(database *sql.DB) {
	const pollInterval = 5 * time.Second

	// One client with a timeout serves both page fetches and webhook posts.
	// The webhook previously went through http.DefaultClient, which has no
	// timeout, so a stalled receiver could block the worker forever.
	client := &http.Client{Timeout: 30 * time.Second}

	go func() {
		for {
			jobs, err := db.GetPendingJobs(database)
			if err != nil {
				log.Printf("Error fetching jobs: %v", err)
				time.Sleep(pollInterval)
				continue
			}

			for _, job := range jobs {
				log.Printf("Processing job %d: %s (selector: %s)", job.ID, job.URL, job.Selector)

				start := time.Now()
				content := ""
				status := "ok"

				switch {
				case job.URL == "":
					// Nothing to fetch; the job is still reported and marked done.
				case !checkRobotsTxt(job.URL):
					log.Printf("Job %d: Blocked by robots.txt", job.ID)
					status = "blocked"
				default:
					req, err := http.NewRequest("GET", job.URL, nil)
					if err != nil {
						log.Printf("Job %d: Error creating request: %v", job.ID, err)
						status = "error"
						break // leaves the switch, not the job loop
					}
					req.Header.Set("User-Agent", userAgent)

					// The body is closed inside fetchContent; a defer placed
					// here would never run because this goroutine never returns.
					content, err = fetchContent(client, req, job.Selector)
					if err != nil {
						log.Printf("Job %d: Error fetching URL: %v", job.ID, err)
						status = "error"
						break // leaves the switch, not the job loop
					}

					log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
					if err := db.UpdateJobContent(database, job.ID, content); err != nil {
						log.Printf("Job %d: Error updating content: %v", job.ID, err)
					}
				}

				duration := time.Since(start)

				if job.WebhookURL != "" {
					payload := map[string]string{
						"status":  status,
						"took":    fmt.Sprintf("%.1fs", duration.Seconds()),
						"url":     job.URL,
						"content": content,
					}
					if err := postWebhook(client, job.WebhookURL, payload); err != nil {
						log.Printf("Error posting webhook for job %d: %v", job.ID, err)
					} else {
						log.Printf("Webhook posted for job %d", job.ID)
					}
				}

				if err := db.MarkJobDone(database, job.ID); err != nil {
					log.Printf("Error marking job %d done: %v", job.ID, err)
				} else {
					log.Printf("Job %d completed", job.ID)
				}
			}

			time.Sleep(pollInterval)
		}
	}()
}