Replace hardcoded title extraction with user-defined CSS selectors using goquery. Users specify a selector in the job JSON to extract any HTML elements. The worker extracts each matched element's text content plus its src/href attributes. The webhook payload includes the extracted content and the URL.
163 lines
3.7 KiB
Go
163 lines
3.7 KiB
Go
package worker
|
|
|
|
import (
|
|
"alpenqueue/pkg/db"
|
|
"bytes"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/temoto/robotstxt"
|
|
)
|
|
|
|
// userAgent is sent as the User-Agent header on robots.txt and page requests,
// and is the agent name tested against robots.txt rules.
// NOTE(review): the URL still contains the "yourusername" placeholder; confirm
// the real repository path before relying on it as a contact link.
const userAgent = "AlpenQueue/1.0 (+https://github.com/yourusername/alpenqueue)"
|
|
|
|
func extractContent(body io.Reader, selector string) string {
|
|
if selector == "" {
|
|
selector = "title"
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(body)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
var results []string
|
|
doc.Find(selector).Each(func(i int, s *goquery.Selection) {
|
|
text := strings.TrimSpace(s.Text())
|
|
if text != "" {
|
|
results = append(results, text)
|
|
}
|
|
|
|
if src, exists := s.Attr("src"); exists {
|
|
results = append(results, src)
|
|
}
|
|
|
|
if href, exists := s.Attr("href"); exists {
|
|
results = append(results, href)
|
|
}
|
|
})
|
|
|
|
return strings.Join(results, "\n")
|
|
}
|
|
|
|
func checkRobotsTxt(targetURL string) bool {
|
|
parsedURL, err := url.Parse(targetURL)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
|
|
client := &http.Client{Timeout: 10 * time.Second}
|
|
req, _ := http.NewRequest("GET", robotsURL, nil)
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == 404 {
|
|
return true
|
|
}
|
|
|
|
robotsData, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
|
|
robots, err := robotstxt.FromBytes(robotsData)
|
|
if err != nil {
|
|
return true
|
|
}
|
|
|
|
return robots.TestAgent(parsedURL.Path, userAgent)
|
|
}
|
|
|
|
func Start(database *sql.DB) {
|
|
go func() {
|
|
for {
|
|
jobs, err := db.GetPendingJobs(database)
|
|
if err != nil {
|
|
log.Printf("Error fetching jobs: %v", err)
|
|
time.Sleep(5 * time.Second)
|
|
continue
|
|
}
|
|
|
|
for _, job := range jobs {
|
|
log.Printf("Processing job %d: %s (selector: %s)", job.ID, job.URL, job.Selector)
|
|
start := time.Now()
|
|
|
|
content := ""
|
|
status := "ok"
|
|
|
|
if job.URL != "" {
|
|
if !checkRobotsTxt(job.URL) {
|
|
log.Printf("Job %d: Blocked by robots.txt", job.ID)
|
|
status = "blocked"
|
|
} else {
|
|
client := &http.Client{Timeout: 30 * time.Second}
|
|
req, err := http.NewRequest("GET", job.URL, nil)
|
|
if err != nil {
|
|
log.Printf("Job %d: Error creating request: %v", job.ID, err)
|
|
status = "error"
|
|
} else {
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
log.Printf("Job %d: Error fetching URL: %v", job.ID, err)
|
|
status = "error"
|
|
} else {
|
|
defer resp.Body.Close()
|
|
content = extractContent(resp.Body, job.Selector)
|
|
log.Printf("Job %d: Extracted content (%d chars)", job.ID, len(content))
|
|
|
|
if err := db.UpdateJobContent(database, job.ID, content); err != nil {
|
|
log.Printf("Job %d: Error updating content: %v", job.ID, err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
duration := time.Since(start)
|
|
|
|
if job.WebhookURL != "" {
|
|
payload := map[string]string{
|
|
"status": status,
|
|
"took": fmt.Sprintf("%.1fs", duration.Seconds()),
|
|
"url": job.URL,
|
|
"content": content,
|
|
}
|
|
jsonData, _ := json.Marshal(payload)
|
|
|
|
resp, err := http.Post(job.WebhookURL, "application/json", bytes.NewBuffer(jsonData))
|
|
if err != nil {
|
|
log.Printf("Error posting webhook for job %d: %v", job.ID, err)
|
|
} else {
|
|
resp.Body.Close()
|
|
log.Printf("Webhook posted for job %d", job.ID)
|
|
}
|
|
}
|
|
|
|
if err := db.MarkJobDone(database, job.ID); err != nil {
|
|
log.Printf("Error marking job %d done: %v", job.ID, err)
|
|
} else {
|
|
log.Printf("Job %d completed", job.ID)
|
|
}
|
|
}
|
|
|
|
time.Sleep(5 * time.Second)
|
|
}
|
|
}()
|
|
}
|