Day 13

A Go Web Spider for Magellan

Unfortunately, you can't run this in the browser here

This program makes HTTP requests and writes to the local filesystem, so it has to run on your own machine.

Run go run main.go to start the spider.


Description


This is a Go web spider built as the backend crawler for the Magellan search engine. It starts from a list of seed URLs, follows the links it finds on each page, and indexes the titles of the pages it visits, building the data that Magellan searches through.
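
The spider is seeded from a starter_websites.json file whose single websites array matches the struct the code below unmarshals into. A minimal example (these URLs are placeholders, not the real seed list) might look like this:

{
  "websites": [
    "https://example.com",
    "https://example.org"
  ]
}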

View the source code


package main

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	_ "modernc.org/sqlite"
)

func main() {
	// magellan.sq3 is the SQLite index that the Magellan frontend searches.
	db, err := sql.Open("sqlite", "magellan.sq3")
	if err != nil {
		fmt.Println("Error opening database:", err)
		return
	}
	defer db.Close()
	// FTS5 provides full-text search over title and snippet; url is stored but not indexed.
	if _, err := db.Exec(`CREATE VIRTUAL TABLE IF NOT EXISTS pages USING fts5(url UNINDEXED, title, snippet)`); err != nil {
		fmt.Println("Error creating pages table:", err)
		return
	}
	// The crawl is seeded from a JSON list of starting URLs.
	data, err := os.ReadFile("starter_websites.json")
	if err != nil {
		fmt.Println("Error loading starter_websites.json:", err)
		return
	}
	var starters struct {
		Websites []string `json:"websites"`
	}
	if err := json.Unmarshal(data, &starters); err != nil {
		fmt.Println("Error parsing starter_websites.json:", err)
		return
	}
	// A timeout keeps one slow server from stalling the whole crawl.
	client := &http.Client{Timeout: 10 * time.Second}
	// Breadth-first crawl: pop a URL, fetch it, queue every link it contains.
	// The searched map ensures each URL is fetched at most once.
	queue := append([]string{}, starters.Websites...)
	searched := map[string]bool{}
	for len(queue) > 0 {
		pageURL := queue[0]
		queue = queue[1:]
		if searched[pageURL] {
			continue
		}
		resp, err := client.Get(pageURL)
		if err != nil {
			searched[pageURL] = true
			continue
		}
		body, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			searched[pageURL] = true
			continue
		}
		// Naive line-based scraping: only the first href on each line is seen,
		// and <title> is assumed to open and close on the same line.
		lines := strings.Split(string(body), "\n")
		for _, line := range lines {
			if strings.Contains(line, `href="`) {
				start := strings.Index(line, `href="`) + len(`href="`)
				relEnd := strings.Index(line[start:], `"`)
				if relEnd == -1 {
					continue
				}
				link := line[start : start+relEnd]
				// Resolve relative links against the page they were found on,
				// and queue only http(s) URLs.
				if base, err := url.Parse(pageURL); err == nil {
					if parsed, err := base.Parse(link); err == nil && (parsed.Scheme == "http" || parsed.Scheme == "https") {
						queue = append(queue, parsed.String())
					}
				}
			}
			if strings.Contains(line, "<title>") {
				start := strings.Index(line, "<title>") + len("<title>")
				relEnd := strings.Index(line[start:], "</title>")
				if relEnd == -1 {
					continue
				}
				title := line[start : start+relEnd]
				if title != "" {
					db.Exec(`INSERT INTO pages (url, title, snippet) VALUES (?, ?, ?)`, pageURL, title, "")
					fmt.Printf("Saved: %s — %s\n", pageURL, title)
				}
			}
		}
		fmt.Printf("Parsed %s\n", pageURL)
		searched[pageURL] = true
		// Be polite: wait half a second between requests.
		time.Sleep(500 * time.Millisecond)
	}
	fmt.Println("Done.")
}
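
Once the spider has filled magellan.sq3, the Magellan frontend can search it with an FTS5 MATCH query. Here's a rough sketch of what that lookup could look like; the search term "golang" and the result limit are placeholders, not part of the spider:

package main

import (
	"database/sql"
	"fmt"

	_ "modernc.org/sqlite"
)

func main() {
	db, err := sql.Open("sqlite", "magellan.sq3")
	if err != nil {
		fmt.Println("Error opening database:", err)
		return
	}
	defer db.Close()
	// MATCH runs a full-text query over the indexed columns (title and snippet);
	// url is UNINDEXED, so it is returned but never matched against.
	rows, err := db.Query(`SELECT url, title FROM pages WHERE pages MATCH ? LIMIT 10`, "golang")
	if err != nil {
		fmt.Println("Error querying pages:", err)
		return
	}
	defer rows.Close()
	for rows.Next() {
		var pageURL, title string
		if err := rows.Scan(&pageURL, &title); err != nil {
			fmt.Println("Error scanning row:", err)
			return
		}
		fmt.Printf("%s: %s\n", pageURL, title)
	}
}

The line-by-line string scanning in the spider is deliberately simple; a production crawler would more likely parse the HTML properly (for example with the golang.org/x/net/html package), which would catch multiple links on one line and tags that span several lines.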
