PlaggerでMeCabを使う
Plaggerをインストール。
Plagger::Plugin::Publish::CSVを参考に、MeCabの解析結果をCSV保存するプラグインを作ってみた。
MeCab.pm
package Plagger::Plugin::Publish::MeCab;

# Plagger publish plugin: feeds the title and body of every entry of a feed
# to MeCab, counts the surface forms tagged as general or proper nouns, and
# writes one tab-separated "word<TAB>count" row per distinct word to a file.
#
# Configuration keys read through $self->conf:
#   dir       - output directory; created with mode 0755 when it is missing
#   filename  - template passed to Plagger::Util::filename_for
#               (defaults to "%u.csv")
#   mode      - "append" opens the file for appending; any other value
#               truncates it
#   encoding  - target encoding for Encode::encode (defaults to "utf8")
#
# NOTE(review): there is deliberately no "use utf8" here.  The part-of-speech
# literals below are compared directly with the strings MeCab returns, so
# this file presumably has to be saved in the same encoding as the MeCab
# dictionary -- confirm against the local installation.

use strict;
use warnings;
use base qw ( Plagger::Plugin );

our $VERSION = 0.01;

use Encode;
use MeCab;
use File::Spec;
use Text::CSV_PP;
use IO::File;

# Hooks feed() into the 'publish.feed' phase.
sub register {
    my ($self, $context) = @_;
    $context->register_hook(
        $self,
        'publish.feed' => \&feed,
    );
}

# 'publish.feed' hook: counts the nouns of $args->{feed} and writes them out.
#
#   $self    - plugin instance (provides conf)
#   $context - object used for error reporting via $context->error
#   $args    - hash ref; $args->{feed} must provide entries()
sub feed {
    my ($self, $context, $args) = @_;

    my $mecab  = MeCab::Tagger->new();
    my $csv    = Text::CSV_PP->new({ binary => 1, sep_char => "\t" });
    my $append = ($self->conf->{mode} && $self->conf->{mode} eq 'append');

    my $dir = $self->conf->{dir};
    unless (defined $dir && length $dir) {
        $context->error("Publish::MeCab: config 'dir' is required");
        return;    # in case error() does not throw
    }
    unless (-e $dir && -d _) {
        mkdir $dir, 0755 or $context->error("mkdir $dir: $!");
    }

    my $file = Plagger::Util::filename_for($args->{feed}, $self->conf->{filename} || "%u.csv");
    my $path = File::Spec->catfile($dir, $file);

    # Pass path and mode separately so that characters in $path can never be
    # interpreted as part of the open mode, and check that the open worked.
    my $io = IO::File->new($path, $append ? '>>' : '>');
    unless ($io) {
        $context->error("open $path: $!");
        return;    # in case error() does not throw
    }

    # word => number of occurrences over the whole feed
    my %tf = ();
    for my $entry ($args->{feed}->entries) {
        $self->_count_nouns($mecab, $entry->{title}, \%tf);
        $self->_count_nouns($mecab, $entry->{body},  \%tf);
    }

    # Sorted so that the row order is reproducible between runs.
    for my $word (sort keys %tf) {
        next unless $csv->combine($word, $tf{$word});
        $io->print($self->convert($csv->string), "\n");
    }

    # Buffered write errors only surface when the handle is closed.
    $io->close or $context->error("close $path: $!");
    return;
}

# Adds the noun counts of $text to the hash referenced by $count_of.
# $text is split on newlines and each line is analysed separately; an
# undefined or empty $text is ignored.
sub _count_nouns {
    my ($self, $mecab, $text, $count_of) = @_;
    return unless defined $text;

    $text = "$text";    # force a plain string before splitting
    return unless length $text;

    for my $sentence (split /\n/, $text) {
        next unless length $sentence;

        # MeCab returns a linked list of nodes; follow the {next} pointers.
        for (my $node = $mecab->parseToNode($sentence); $node; $node = $node->{next}) {
            my $midasi = $node->{surface};
            next unless defined $midasi && defined $node->{feature};

            # The first two comma-separated feature fields are the part of
            # speech and its sub-category.
            my ($hinsi, $hinsi2) = (split /,/, $node->{feature})[0, 1];
            next unless defined $hinsi && defined $hinsi2;

            # Keep nouns whose sub-category is "proper noun" or "general".
            if ($hinsi eq '名詞' and ($hinsi2 eq '固有名詞' or $hinsi2 eq '一般')) {
                $count_of->{$midasi}++;
            }
        }
    }
    return;
}

# Returns $str encoded in the configured output encoding (default "utf8").
# A string without the UTF-8 flag is first decoded in place as UTF-8.
# NOTE(review): this presumably only round-trips correctly when MeCab emits
# UTF-8 -- confirm for dictionaries in other encodings.
sub convert {
    my ($self, $str) = @_;
    utf8::decode($str) unless utf8::is_utf8($str);
    return encode($self->conf->{encoding} || 'utf8', $str);
}

1;
__END__
rss2mecab.yaml
# Plagger configuration: subscribe to two feeds and pass them to the
# Publish::MeCab plugin.
global:
  plugin_path:
    - /Library/Perl/5.8.6/Plagger/Plugin/
    - /Users/tolerance/
  assets_path: /Library/Perl/5.8.6/Plagger/assets/

plugins:
  - module: Subscription::Config
    config:
      feed:
        - http://wiredvision.jp/news/atom.xml
        - http://japan.zdnet.com/rss/news/index.rdf

  - module: Publish::MeCab
    config:
      dir: /Users/tolerance/mecabout
      # Quoted: a plain YAML scalar may not begin with the "%" indicator.
      filename: '%u.csv'