Web Scraper
Complete source code for the concurrent web scraper capstone. It combines a worker pool, rate limiting, context cancellation, and mutex-protected shared state.
package main
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"sync"
"time"
"golang.org/x/net/html"
"golang.org/x/time/rate"
)
// --- URL Extraction ---
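// extractLinks tokenizes the HTML body and collects every <a href> that
// resolves (relative to baseURL) to an http(s) URL on the same host.
// Fragments are stripped so "/page#top" and "/page" count as one URL.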
func extractLinks(body io.Reader, baseURL *url.URL) []string {
var links []string
tokenizer := html.NewTokenizer(body)
for {
tt := tokenizer.Next()
if tt == html.ErrorToken {
break
}
if tt == html.StartTagToken {
token := tokenizer.Token()
if token.Data != "a" {
continue
}
for _, attr := range token.Attr {
if attr.Key != "href" {
continue
}
link, err := baseURL.Parse(attr.Val)
if err != nil {
continue
}
if link.Host == baseURL.Host && strings.HasPrefix(link.Scheme, "http") {
link.Fragment = ""
links = append(links, link.String())
}
}
}
}
return links
}
// --- Fetcher ---
type FetchResult struct {
URL string
Links []string
Err error
}
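// fetch downloads rawURL with a request tied to ctx and returns the
// same-host links found on the page. Transport failures and non-200
// responses are reported through FetchResult.Err.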
func fetch(ctx context.Context, client *http.Client, rawURL string) FetchResult {
req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
if err != nil {
return FetchResult{URL: rawURL, Err: err}
}
req.Header.Set("User-Agent", "GoScraper/1.0")
resp, err := client.Do(req)
if err != nil {
return FetchResult{URL: rawURL, Err: err}
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return FetchResult{URL: rawURL, Err: fmt.Errorf("status %d", resp.StatusCode)}
}
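	// rawURL was already parsed successfully inside NewRequestWithContext,
	// so this second Parse cannot fail here.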
baseURL, _ := url.Parse(rawURL)
links := extractLinks(resp.Body, baseURL)
return FetchResult{URL: rawURL, Links: links}
}
// --- Visited Tracker (goroutine-safe) ---
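// Visited records which URLs have been seen. A plain map guarded by a
// mutex is sufficient because the critical sections are tiny.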
type Visited struct {
mu sync.Mutex
urls map[string]bool
}
func NewVisited() *Visited {
return &Visited{urls: make(map[string]bool)}
}
func (v *Visited) Add(u string) bool {
v.mu.Lock()
defer v.mu.Unlock()
if v.urls[u] {
return false
}
v.urls[u] = true
return true
}
func (v *Visited) Count() int {
v.mu.Lock()
defer v.mu.Unlock()
return len(v.urls)
}
// --- Worker ---
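// worker pulls URLs from jobs, waits for a rate-limiter token, fetches the
// page, and sends the result. It exits when jobs is closed or ctx is done.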
func worker(
ctx context.Context,
client *http.Client,
limiter *rate.Limiter,
jobs <-chan string,
results chan<- FetchResult,
wg *sync.WaitGroup,
) {
defer wg.Done()
for {
select {
case <-ctx.Done():
return
case u, ok := <-jobs:
if !ok {
return
}
if err := limiter.Wait(ctx); err != nil {
return
}
select {
case results <- fetch(ctx, client, u):
case <-ctx.Done():
return
}
}
}
}
// --- Scraper ---
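// Scraper bundles the crawl configuration: where to start, the maximum
// number of pages to visit, the worker count, the requests-per-second
// limit, and an overall deadline for the run.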
type Scraper struct {
startURL string
maxPages int
workers int
rateLimit rate.Limit
timeout time.Duration
}
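// Run performs the crawl. It seeds the start URL, then for each result it
// enqueues newly discovered links until maxPages unique URLs have been seen.
// The pending counter tracks in-flight jobs; when it reaches zero the jobs
// channel is closed, the workers exit, and the collected URLs are returned.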
func (s *Scraper) Run() ([]string, error) {
ctx, cancel := context.WithTimeout(context.Background(), s.timeout)
defer cancel()
client := &http.Client{Timeout: 10 * time.Second}
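	// Burst equal to the per-second rate lets at most one second's worth of
	// requests go out back to back.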
limiter := rate.NewLimiter(s.rateLimit, int(s.rateLimit))
visited := NewVisited()
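	// At most maxPages URLs are ever enqueued (Add is gated by the Count
	// check below), so buffers of this size mean the sends never block.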
jobs := make(chan string, s.maxPages)
results := make(chan FetchResult, s.maxPages)
var wg sync.WaitGroup
for i := 0; i < s.workers; i++ {
wg.Add(1)
go worker(ctx, client, limiter, jobs, results, &wg)
}
go func() {
wg.Wait()
close(results)
}()
visited.Add(s.startURL)
jobs <- s.startURL
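	// pending counts URLs that have been enqueued but whose result has not
	// yet been received.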
pending := 1
var discovered []string
for result := range results {
pending--
if result.Err != nil {
fmt.Printf("ERROR %s: %v\n", result.URL, result.Err)
} else {
fmt.Printf("OK %s (%d links)\n", result.URL, len(result.Links))
discovered = append(discovered, result.URL)
}
for _, link := range result.Links {
if visited.Count() >= s.maxPages {
break
}
if visited.Add(link) {
pending++
select {
case jobs <- link:
case <-ctx.Done():
close(jobs)
return discovered, ctx.Err()
}
}
}
if pending == 0 {
close(jobs)
break
}
}
	// If the result loop ended because the context expired, surface that;
	// otherwise ctx.Err() is nil.
	return discovered, ctx.Err()
}
// --- Main ---
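// main reads the start URL from the command line, runs the scraper with
// fixed defaults, and prints a summary of the pages scraped.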
func main() {
if len(os.Args) < 2 {
fmt.Println("usage: scraper <url>")
os.Exit(1)
}
scraper := &Scraper{
startURL: os.Args[1],
maxPages: 50,
workers: 5,
rateLimit: 2,
timeout: 30 * time.Second,
}
fmt.Printf("Scraping %s (max %d pages, %d workers, %.0f req/s)\n\n",
scraper.startURL, scraper.maxPages, scraper.workers, float64(scraper.rateLimit))
discovered, err := scraper.Run()
if err != nil {
fmt.Println("\nstopped:", err)
}
fmt.Printf("\n--- Results ---\n")
fmt.Printf("Pages scraped: %d\n", len(discovered))
for _, u := range discovered {
fmt.Println(" ", u)
}
}

To build and run:

mkdir scraper && cd scraper
go mod init scraper
go get golang.org/x/time/rate
go get golang.org/x/net/html
# paste the code into main.go
go run . https://go.dev
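
As an optional sanity check (not part of the original listing), a small test for extractLinks can sit next to main.go in a main_test.go file. The HTML snippet and example.com URLs below are made up for illustration; the test only assumes the extractLinks signature shown above.

package main

import (
	"net/url"
	"strings"
	"testing"
)

func TestExtractLinks(t *testing.T) {
	base, _ := url.Parse("https://example.com/docs/")
	page := `<html><body>
		<a href="/about">About</a>
		<a href="guide#intro">Guide</a>
		<a href="https://other.com/x">External</a>
	</body></html>`

	links := extractLinks(strings.NewReader(page), base)

	// Same-host links are kept (fragment stripped); the external link is dropped.
	want := []string{
		"https://example.com/about",
		"https://example.com/docs/guide",
	}
	if len(links) != len(want) {
		t.Fatalf("got %v, want %v", links, want)
	}
	for i := range want {
		if links[i] != want[i] {
			t.Errorf("link %d: got %q, want %q", i, links[i], want[i])
		}
	}
}

Run it with go test . in the same directory.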