rubyで2chクローラーを書いてみた(改訂)
キャッシュ機能追加
#!/usr/bin/ruby -KU -w
#
# 2ch crawler with a simple download cache.
#
# For every board linked from the bbstable menu page this script
#   * writes the board's current thread list to ima/<board>, and
#   * appends each archive subject.txt not downloaded before to kako/<board>.
# URLs of archive files that were downloaded are remembered across runs in
# ./kako/cache_subject.txt (one URL per line).

require 'net/http'
require 'kconv'

# Fetches +url+ over HTTP and returns the response body converted to UTF-8.
#
# Prints "<status code> <url>" to stdout for every response received.
# Returns an empty string when the status is not 200 or when the request
# fails; failures are reported on stderr so one bad URL does not abort the
# whole crawl.
def getfile(url)
  content = ''
  begin
    res = Net::HTTP.get_response(URI.parse(url))
    puts "#{res.code} #{url}"
    content = res.body.toutf8 if res.code == '200'
  rescue Timeout::Error, SocketError, SystemCallError, IOError,
         Net::ProtocolError, Net::HTTPBadResponse, URI::InvalidURIError => e
    warn "fetch failed: #{url} (#{e.class}: #{e.message})"
  end
  content
end

# Downloads +url+ and returns every match of +re+ in the page, in the form
# String#scan produces (one array of captured groups per match).
def scanhref(url, re)
  getfile(url).scan(re)
end

# Derives the flat file name used under ima/ and kako/ from a board URL:
# the scheme and one trailing slash are dropped, remaining slashes become
# underscores ("http://host/board/" -> "host_board").
def board_file_name(url)
  url.sub(%r{\Ahttps?://}, '').chomp('/').tr('/', '_')
end

# Reads the cache file at +path+ (one URL per line) into a Hash of
# URL => 1. Returns an empty Hash when the file does not exist.
def load_cache(path)
  cache = {}
  return cache unless FileTest.file?(path)

  File.foreach(path) { |line| cache[line.chomp] = 1 }
  cache
end

# Rewrites the cache file at +path+ with every URL key of +cache+.
def save_cache(path, cache)
  File.open(path, 'w') do |cachefile|
    cache.each_key { |url| cachefile.puts url }
  end
end

Dir.mkdir('./ima') unless FileTest.directory?('./ima')
Dir.mkdir('./kako') unless FileTest.directory?('./kako')

cache = load_cache('./kako/cache_subject.txt')

begin
  scanhref(
    'http://menu.2ch.net/bbstable.html',
    /<A HREF=(http\S+).*?>(.+?)<\/A>/
  ).each do |url, title|
    puts title
    ufile = board_file_name(url)

    # Current thread list: the file is rewritten on every run.
    threads = scanhref(
      url + 'subback.html',
      /<a href=\"(\S+)\".*?>\d+\:(.+?)<\/a>/
    )
    File.open("ima/#{ufile}", 'w') do |subject|
      threads.each { |u, t| subject.print "#{u}<>#{t}\n" }
    end

    # Archived thread lists: each subject.txt is appended once.
    scanhref(
      url + 'kako/',
      /<A HREF=\"(\S+)\">subject.txt<\/A><\/TD>/
    ).each do |u|
      archive_url = url + 'kako/' + u[0]
      next if cache[archive_url] == 1

      body = getfile(archive_url)
      # An empty body means the fetch failed or returned nothing; leave the
      # URL out of the cache so a later run retries it.
      next if body.empty?

      File.open("kako/#{ufile}", 'a') { |subject| subject.print body }
      cache[archive_url] = 1
    end
  end
ensure
  # Save the cache even when the crawl is interrupted, so files already
  # appended to kako/ are not downloaded and appended again next time.
  save_cache('./kako/cache_subject.txt', cache)
end