🕸️ ActoCrawler

ActoCrawler is a Swift Concurrency-powered crawler engine built on top of Actomaton. It is highly customizable, so it can be used to build HTML scrapers, image scrapers, and more.

Example

/// Result produced for each crawled page in this example.
struct Output: Sendable {
    /// Number of follow-up requests extracted from the page.
    let nextLinksCount: Int
}

// Build an HTML-scraping crawler that emits one `Output` per crawled page.
let htmlCrawler = await Crawler<Output, Void>.htmlScraper(
    config: CrawlerConfig(
        maxDepths: 10,
        maxTotalRequests: 100,
        timeoutPerRequest: 5, // presumably seconds — confirm against `CrawlerConfig`
        userAgent: "ActoCrawler",
        // NOTE(review): the domain strings here and below look like regex patterns — confirm.
        domainFilteringPolicy: .disallowedDomains([".*google.com*" /* ... */]),
        // Per-domain queue settings: a concurrency limit plus a fixed delay or a delay range.
        domainQueueTable: [
            ".*example1.com*": .init(maxConcurrency: 1, delay: 0),
            ".*example2.com*": .init(maxConcurrency: 5, delay: 0.1 ... 0.5)
        ]
    ),
    scrapeHTML: { response in
        // Collect the `href` attribute of every `<a>` element in the page.
        let html = response.data
        let links = try html.select("a").map { try $0.attr("href") }

        // Keep only the hrefs that parse as URLs and turn each one into a follow-up request.
        // NOTE(review): `URL(string:)` also accepts relative hrefs; whether the crawler
        // resolves them against the current page is not visible here — confirm.
        let nextRequests = links
            .compactMap(URL.init(string:))
            .map { UserRequest(url: $0) }

        // Hand back the requests to crawl next together with this page's output.
        return (nextRequests, Output(nextLinksCount: nextRequests.count))
    }
)

// Visit initial page.
htmlCrawler.visit(url: URL(string: "https://www.wikipedia.org")!)

// Observe crawl events.
for await event in htmlCrawler.events {
    switch event {
    case let .willCrawl(req):
        print("Crawl : 🕸️ [\(req.order)] [d=\(req.depth)] \(req.url)")
    case let .didCrawl(req, .success(output)):
        print("Output: ✅ [\(req.order)] [d=\(req.depth)] \(req.url), nextLinksCount = \(output.nextLinksCount)")
    case let .didCrawl(req, .failure(error)):
        print("Output: ❌ [\(req.order)] [d=\(req.depth)] \(req.url), error = \(error)")
    }
}

// Reached only after the `events` sequence has finished.
print("Output Done")

Acknowledgements

License

MIT

GitHub

View GitHub