rubyで2chクローラーを書いてみた(改訂)

キャッシュ機能追加

#!/usr/bin/ruby -KU -w

require 'net/http'
require 'kconv'

# Fetch +url+ over HTTP and return the response body converted to UTF-8.
#
# Prints "<status code> <url>" to stdout for every response received.
# This is a best-effort fetch: a non-200 response or a network/URL failure
# yields an empty string (failures are reported on stderr), so callers can
# always treat the result as a String.
#
# @param url [String] absolute URL to download
# @return [String] UTF-8 body on HTTP 200, otherwise ''
def getfile(url)
  res = Net::HTTP.get_response(URI.parse(url))
  puts "#{res.code} #{url}"
  res.code == '200' ? res.body.toutf8 : ''
rescue Timeout::Error, SocketError, SystemCallError, IOError,
       URI::InvalidURIError, Net::ProtocolError, Net::HTTPBadResponse => e
  # One unreachable host or malformed link must not abort the whole crawl;
  # report it instead of swallowing it silently.
  warn "getfile: #{url}: #{e.class}: #{e.message}"
  ''
end

# Download +url+ with getfile and return every match of +re+ in the page,
# exactly as String#scan reports them.
#
# @param url [String] page to download
# @param re [Regexp] pattern to scan for
# @return [Array] result of String#scan on the downloaded text
def scanhref(url, re)
  page = getfile(url)
  page.scan(re)
end

# Make sure both output directories exist before anything is written:
# ./ima receives the current thread lists, ./kako the archived ones.
%w[./ima ./kako].each do |dir|
  Dir.mkdir(dir) unless File.directory?(dir)
end

# URLs of archive subject.txt files already downloaded by earlier runs,
# stored as keys (every value is 1). Loaded from one URL per line.
cache = {}
if File.file?('./kako/cache_subject.txt')
  # File.foreach closes the file when iteration ends; a bare
  # File.open(...).each would leave the handle open.
  File.foreach('./kako/cache_subject.txt') { |line| cache[line.chomp] = 1 }
end

# Walk every board linked from the 2ch board table and, for each board:
#   1. write the current thread list to ima/<board file> as "href<>title"
#      lines (the file is rewritten from scratch on every run);
#   2. append each archive subject.txt linked from <board>kako/ that is not
#      yet recorded in +cache+ to kako/<board file>.
scanhref(
  'http://menu.2ch.net/bbstable.html',
  /<A HREF=(http\S+).*?>(.+?)<\/A>/
).each do |url, title|
  puts title

  # Board file name: drop the first 7 characters (the length of "http://")
  # and the last character (expected to be the trailing "/"), then replace
  # the remaining "/" with "_".
  ufile = url[7..-2].tr('/', '_')

  threads = scanhref(
    url + 'subback.html',
    /<a href=\"(\S+)\".*?>\d+\:(.+?)<\/a>/
  )
  # Block form closes the handle even if a write raises.
  File.open("ima/#{ufile}", 'w') do |subject|
    threads.each { |u, t| subject.print "#{u}<>#{t}\n" }
  end

  scanhref(
    url + 'kako/',
    /<A HREF=\"(\S+)\">subject.txt<\/A><\/TD>/
  ).each do |u|
    # The pattern has a single capture group, so each match is a
    # one-element array holding the relative link.
    kako_url = url + 'kako/' + u[0]
    next if cache[kako_url] == 1

    body = getfile(kako_url)
    # getfile returns '' when the download fails; leave such URLs out of
    # the cache so that a later run retries them.
    next if body.empty?

    File.open("kako/#{ufile}", 'a') { |subject| subject.print body }
    cache[kako_url] = 1
  end
end

# Persist the set of downloaded archive URLs, one per line, for the next run.
# Block form closes the file even if a write raises.
File.open('./kako/cache_subject.txt', 'w') do |cachefile|
  cache.each_key { |cached_url| cachefile.puts cached_url }
end