Rider, a simple Ruby Web spider
Code and specs are on GitHub: http://github.com/sqs/rider/
git clone git://github.com/sqs/rider.git
Example
require 'PATH_TO_RIDER/lib/rider'

# Reuse the queue named 'web' if `unserialize` returns one; otherwise create a
# fresh queue with the same name.
# Using the host-partitioned queue means each request will alternate among hosts.
queue = Rider::HostPartitionedQueue.unserialize('web') || Rider::HostPartitionedQueue.new('web')

# Mask passed to the crawler (presumably restricts which URIs are followed --
# confirm against Rider::Crawler). The dot in "nytimes.com" is escaped so it
# matches only a literal ".", and \A anchors the match to the start of the
# string rather than to the start of any line.
nytimes_mask = %r{\Ahttp://(www\.)?nytimes\.com}
crawler = Rider::Crawler.new(nytimes_mask, queue)

# Seed the queue with the first URI to fetch.
queue.push('http://nytimes.com/')

crawler.each_document do |uri, metadata, contents|
  # Print the URI followed by the first 101 characters of the document.
  puts "* #{uri}: #{contents[0..100]}..."
  # do something with uri, metadata, and contents

  # Put the links that Rider should follow (taken from the current document)
  # at the end of the block; they are added to the queue.
  # NOTE: `some_document` is a placeholder that this example never defines --
  # replace it with your own object built from `contents`.
  some_document.follow_uris
end