Unfortunately, you can't run this in the browser here.
This program makes HTTP requests and writes to the filesystem, so it must be run locally.
Run `go run main.go` in a directory containing starter_websites.json to start the spider.
Description
View the source code
package main
import (
"database/sql"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
_ "modernc.org/sqlite"
)
// main runs the spider. It opens (or creates) the SQLite database
// magellan.sq3, makes sure the full-text "pages" table exists, seeds a crawl
// queue from starter_websites.json, and then crawls breadth-first: each page
// is fetched once, its title (if any) is stored, and every http/https link
// it contains is added to the queue.
//
// The "sqlite" driver name is the one registered by the blank import of
// modernc.org/sqlite at the top of the file.
func main() {
	db, err := sql.Open("sqlite", "magellan.sq3")
	if err != nil {
		fmt.Println("Error opening database:", err)
		return
	}
	defer db.Close()

	// Without the table every later INSERT would fail, so stop here rather
	// than crawl and silently store nothing.
	if _, err := db.Exec(`CREATE VIRTUAL TABLE IF NOT EXISTS pages USING fts5(url UNINDEXED, title, snippet)`); err != nil {
		fmt.Println("Error creating pages table:", err)
		return
	}

	data, err := os.ReadFile("starter_websites.json")
	if err != nil {
		fmt.Println("Error loading starter_websites.json:", err)
		return
	}
	var starters struct {
		Websites []string `json:"websites"`
	}
	if err := json.Unmarshal(data, &starters); err != nil {
		fmt.Println("Error parsing starter_websites.json:", err)
		return
	}

	// A single client is reused for every request; the timeout keeps one
	// unresponsive server from stalling the whole crawl.
	client := &http.Client{Timeout: fetchTimeout}

	// seen records every URL that has ever been queued, so a URL is fetched
	// at most once and the queue does not fill up with duplicates.
	seen := make(map[string]bool, len(starters.Websites))
	var queue []string
	enqueue := func(u string) {
		if seen[u] {
			return
		}
		seen[u] = true
		queue = append(queue, u)
	}
	for _, site := range starters.Websites {
		enqueue(site)
	}

	for len(queue) > 0 {
		pageURL := queue[0]
		queue = queue[1:]

		body, err := fetchPage(client, pageURL)
		if err != nil {
			fmt.Printf("Skipping %s: %v\n", pageURL, err)
			continue
		}

		for _, link := range extractLinks(pageURL, body) {
			enqueue(link)
		}

		if title := extractTitle(body); title != "" {
			if _, err := db.Exec(`INSERT INTO pages (url, title, snippet) VALUES (?, ?, ?)`, pageURL, title, ""); err != nil {
				fmt.Printf("Error saving %s: %v\n", pageURL, err)
			} else {
				fmt.Printf("Saved: %s — %s\n", pageURL, title)
			}
		}

		fmt.Printf("Parsed %v\n", pageURL)
		// Pause between pages so the crawl does not hammer remote servers.
		time.Sleep(crawlDelay)
	}
	fmt.Println("Done.")
}

const (
	// crawlDelay is the pause after each successfully processed page.
	crawlDelay = 500 * time.Millisecond
	// fetchTimeout bounds a whole request, including reading the body.
	fetchTimeout = 15 * time.Second
	// maxBodyBytes caps how much of a response is read into memory.
	maxBodyBytes = 10 << 20
)

// fetchPage downloads pageURL with client and returns the response body as a
// string. It returns an error if the request fails, the status is not 2xx, or
// the body cannot be read. At most maxBodyBytes of the body are read; the
// response body is always closed before returning.
func fetchPage(client *http.Client, pageURL string) (string, error) {
	resp, err := client.Get(pageURL)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("unexpected status %s", resp.Status)
	}
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return "", fmt.Errorf("reading body: %w", err)
	}
	return string(body), nil
}

// extractLinks returns the absolute http/https URLs referenced by every
// href="..." attribute in body, in document order, resolved against pageURL.
// Fragments are removed so that "page#a" and "page#b" name the same page.
// It returns nil if pageURL cannot be parsed.
//
// This is a plain substring scan, not an HTML parser: only double-quoted
// href values that open and close on the same line are recognised.
func extractLinks(pageURL, body string) []string {
	base, err := url.Parse(pageURL)
	if err != nil {
		return nil
	}

	const marker = `href="`
	var links []string
	for _, line := range strings.Split(body, "\n") {
		// Walk every href on the line, not only the first one.
		for {
			i := strings.Index(line, marker)
			if i == -1 {
				break
			}
			line = line[i+len(marker):]
			end := strings.Index(line, `"`)
			if end == -1 {
				// Unterminated attribute: nothing more to read on this line.
				break
			}
			raw := line[:end]
			line = line[end+1:]

			target, err := base.Parse(raw)
			if err != nil {
				continue
			}
			if target.Scheme != "http" && target.Scheme != "https" {
				continue
			}
			target.Fragment = ""
			target.RawFragment = ""
			links = append(links, target.String())
		}
	}
	return links
}

// extractTitle returns the text between the first "<title>" and the next
// "</title>" in body, with leading/trailing whitespace removed and internal
// runs of whitespace (including line breaks) collapsed to single spaces.
// It returns "" when there is no complete title element.
func extractTitle(body string) string {
	const openTag, closeTag = "<title>", "</title>"
	i := strings.Index(body, openTag)
	if i == -1 {
		return ""
	}
	rest := body[i+len(openTag):]
	j := strings.Index(rest, closeTag)
	if j == -1 {
		return ""
	}
	return strings.Join(strings.Fields(rest[:j]), " ")
}
Previous Day
Next Day