# # -*- Perl -*- # $Id: msword.pl,v 1.28.4.26 2005/10/07 03:28:56 opengl2772 Exp $ # Copyright (C) 1997-2000 Satoru Takabayashi, # 2000-2005 Namazu Project All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA # # This file must be encoded in EUC-JP encoding # package msword; use strict; use English; use File::Basename; require 'util.pl'; require 'gfilter.pl'; require 'html.pl'; eval 'require NKF;'; my $wordconvpath = undef; my @wordconvopts = undef; my $wvversionpath = undef; my $utfconvpath = undef; my $convname = undef; my $wvsummarypath = undef; my $nkfversion = 0.00; eval '$nkfversion = $NKF::VERSION;'; $nkfversion = 0.00 if (!defined $nkfversion); sub mediatype() { return ('application/msword'); } sub status() { # The check of a dependence filter. return 'no' if (html::status() ne 'yes'); $wordconvpath = util::checkcmd('antiword'); @wordconvopts = ("-s"); return 'yes' if defined $wordconvpath; $wvsummarypath = util::checkcmd('wvSummary'); $wvversionpath = util::checkcmd('wvVersion'); $wordconvpath = util::checkcmd('wvWare'); if (defined $wordconvpath) { my @cmd = ($wordconvpath, "--version"); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $version = util::readfile($fh_out); codeconv::normalize_document(\$version); util::fclose($fh_out); if ($version =~ s/wvWare (\d\.\d)\.(\d).*/$1$2/) { if ($version >= 0.75) { if (!util::islang("ja")) { return 'yes'; } else { $utfconvpath = util::checkcmd('lv'); if (defined $wvversionpath && (defined $utfconvpath || $English::PERL_VERSION >= 5.008 || $nkfversion >= 2.04)) { return 'yes'; } } } } } $wordconvpath = util::checkcmd('wvHtml'); if (defined $wordconvpath) { if (!util::islang("ja")) { return 'yes'; } else { $utfconvpath = util::checkcmd('lv'); if (defined $wvversionpath && (defined $utfconvpath || $English::PERL_VERSION >= 5.008 || $nkfversion >= 2.04)) { return 'yes'; } } } $wordconvpath = util::checkcmd('doccat'); @wordconvopts = ("-o", "e"); return 'yes' if defined $wordconvpath; return 'no'; } sub recursive() { return 0; } sub pre_codeconv() { return 0; } sub post_codeconv () { return 0; } sub add_magic ($) { return; } sub filter ($$$$$) { my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_; my $err = undef; $convname = basename($wordconvpath) unless (defined $convname); if ($convname =~ /(?:wvWare|wvHtml)/i) { $err = filter_wv($orig_cfile, $cont, $weighted_str, $headings, $fields); } else { $err = filter_doccat($orig_cfile, $cont, $weighted_str, $headings, $fields); } return $err; } sub filter_wv ($$$$$) { my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_; my $cfile = defined $orig_cfile ? $$orig_cfile : ''; my $err = undef; util::vprint("Processing ms-word file ... (using '$wordconvpath')\n"); my $tmpfile = util::tmpnam('NMZ.word'); { my $fh = util::efopen("> $tmpfile"); print $fh $$cont; util::fclose($fh); } # Check version of word document (greater than word7,8 or else). if (util::islang("ja")) { my $docversion = getDocumentVersion($tmpfile); if ($docversion !~ /^word[78]$/) { unlink $tmpfile; return _("Unsupported format: ") . $docversion; } } # get summary info in all OLE documents. getSummaryInfo($tmpfile, $cont, $weighted_str, $headings, $fields); my $title = $fields->{'title'}; if ($convname =~ /wvWare/i) { $err = filter_wvWare($tmpfile, $cont, $weighted_str, $headings, $fields); } else { $err = filter_wvHtml($tmpfile, $cont, $weighted_str, $headings, $fields); } unlink $tmpfile; return $err if (defined $err); # Exclude wvHtml's footer because it has no good index terms. $$cont =~ s/(.*).*$/$1/s; html::html_filter($cont, $weighted_str, $fields, $headings); $fields->{'title'} = $title if (defined $title); gfilter::line_adjust_filter($cont); gfilter::line_adjust_filter($weighted_str); gfilter::white_space_adjust_filter($cont); $fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str) unless $fields->{'title'}; gfilter::show_filter_debug_info($cont, $weighted_str, $fields, $headings); return undef; } sub filter_wvWare ($$$$$) { my ($cfile, $cont, $weighted_str, $headings, $fields) = @_; my $tmpfile2 = util::tmpnam('NMZ.word2'); my ($ofile, $tpath) = ("", ""); ($ofile, $tpath) = fileparse($tmpfile2); my $name = basename($ofile); if (util::islang("ja")) { @wordconvopts = ( "--nographics", "-d", "$tpath", "-b", "$name", "--charset=EUC-JP" ); } else { @wordconvopts = ( "--nographics", "-d", "$tpath", "-b", "$name" ); } my @cmd = ($wordconvpath, @wordconvopts, $cfile); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $size = util::filesize($fh_out); if ($size == 0) { util::fclose($fh_out); unlink($tmpfile2); return "Unable to convert file ($wordconvpath error occurred)."; } if ($size > $conf::FILE_SIZE_MAX) { util::fclose($fh_out); unlink($tmpfile2); return 'Too large word file.'; } $$cont = util::readfile($fh_out); codeconv::normalize_document($cont); util::fclose($fh_out); unlink($tmpfile2); if (util::islang("ja")) { # Title shoud be removed. $$cont =~ s!.*?!!is; # div name shoud be removed. $$cont =~ s!(]*))?)*)\s+name=(?:"(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^\s>]*)(\s[A-Z]+\w*(?:=(?:"(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^\s>]*))?\s*>)!$1$2!igs; codeconv::normalize_eucjp($cont); } return undef; } sub filter_wvHtml ($$$$$) { my ($cfile, $cont, $weighted_str, $headings, $fields) = @_; # Check version of wvWare (greater than 0.7 or else). my $tmpfile2 = util::tmpnam('NMZ.word2'); my ($ofile, $tpath) = ("", ""); { my @cmd = ($wordconvpath, "--version"); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $result = util::readfile($fh_out); codeconv::normalize_document(\$result); util::fclose($fh_out); if ($result ne "" and $result !~ /usage/i and $result ge "0.7") { ($ofile, $tpath) = fileparse($tmpfile2); @wordconvopts = ("--targetdir=$tpath"); } else { if (util::islang("ja")) { my $docversion = getDocumentVersion($cfile); return _("Unsupported format: ") . $docversion if ($docversion =~ /^word7$/i); } $ofile = $tmpfile2; @wordconvopts = (); } } my @cmd = ($wordconvpath, @wordconvopts, $cfile, $ofile); my $status = util::syscmd( command => \@cmd, option => { "stdout" => "/dev/null", "stderr" => "/dev/null", }, ); unless (-e $tmpfile2) { return "Unable to convert file ($wordconvpath error occurred)."; } my $size = util::filesize($tmpfile2); if ($size == 0) { unlink $tmpfile2; return "Unable to convert file ($wordconvpath error occurred)."; } if ($size > $conf::FILE_SIZE_MAX) { unlink $tmpfile2; return 'Too large word file.'; } { $$cont = util::readfile($tmpfile2); } unlink $tmpfile2; codeconv::normalize_document($cont); # Code conversion for Japanese document. if (util::islang("ja")) { # Title shoud be removed. $$cont =~ s!.*?!!is; # div name shoud be removed. $$cont =~ s!(]*))?)*)\s+name=(?:"(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^\s>]*)(\s[A-Z]+\w*(?:=(?:"(?:\\"|[^"])*"|'(?:\\'|[^'])*'|[^\s>]*))?\s*>)!$1$2!igs; utf8_to_eucjp($cont); } return undef; } sub filter_doccat ($$$$$) { my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_; my $cfile = defined $orig_cfile ? $$orig_cfile : ''; util::vprint("Processing ms-word file ... (using '$wordconvpath')\n"); my $tmpfile = util::tmpnam('NMZ.word'); { my $fh = util::efopen("> $tmpfile"); print $fh $$cont; util::fclose($fh); } { my @cmd = ($wordconvpath, @wordconvopts, $tmpfile); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $size = util::filesize($fh_out); if ($size == 0) { util::fclose($fh_out); unlink $tmpfile; return "Unable to convert file ($wordconvpath error occurred)."; } if ($size > $conf::FILE_SIZE_MAX) { util::fclose($fh_out); unlink $tmpfile; return 'Too large word file.'; } $$cont = util::readfile($fh_out); util::fclose($fh_out); } unlink $tmpfile; codeconv::normalize_document($cont); gfilter::line_adjust_filter($cont); gfilter::line_adjust_filter($weighted_str); gfilter::white_space_adjust_filter($cont); $fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str) unless $fields->{'title'}; gfilter::show_filter_debug_info($cont, $weighted_str, $fields, $headings); return undef; } sub getDocumentVersion ($) { my ($cfile) = @_; my $docversion = "unknown"; # Check version of word document (greater than word7,8 or else). if (util::islang("ja")) { my @cmd = ($wvversionpath, $cfile); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $result = util::readfile($fh_out); codeconv::normalize_document(\$result); util::fclose($fh_out); if ($result =~ /^Version: (word\d+)(?:,| )/i) { $docversion = $1; } } return $docversion; } sub getSummaryInfo ($$$$$) { my ($cfile, $cont, $weighted_str, $headings, $fields) = @_; return undef unless (defined $wvsummarypath); my @cmd = ($wvsummarypath, $cfile); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); my $summary = util::readfile($fh_out); codeconv::normalize_document(\$summary); my $orgsummary = $summary; my $size = util::filesize($fh_out); util::fclose($fh_out); if ($size == 0) { return undef; } if ($size > $conf::FILE_SIZE_MAX) { return 'Too large summary file.'; } # Codepage # 932 : 0x000003a4 : Shift_JIS # 10001 : 0x00002711 : x-mac-japanese # 65001 : 0xfffffde9 : UTF-8 my $codepage = "000003a4"; # Shift_JIS my $title = undef; my $subject = undef; my $lastauthor = undef; my $author = undef; my $keywords = undef; if ($summary =~ /^Codepage is 0x([0-9a-f]*)/m) { $codepage = sprintf("%8.8x", hex($1)); } if ($codepage eq "fffffde9") { utf8_to_eucjp(\$summary); } else { # codeconv::toeuc(\$summary); codeconv::codeconv_document(\$summary); } if ($summary =~ /^The title is (.*)$/m) { my $orgtitle; $title = $1; # PowerPoint Only # $orgsummary =~ /^The title is (.*)$/m; # $orgtitle = $1; # undef $title if $orgtitle eq # which has no slide title # "\xbd\xd7\xb2\xc4\xde\x20\xc0\xb2\xc4\xd9\x82\xc8\x82\xb5"; } if ($summary =~ /^The subject is (.*)$/m) { $subject = $1; } if ($summary =~ /^The last author was (.*)$/m) { $lastauthor = $1; } if ($summary =~ /^The author is (.*)$/m) { $author = $1; } if ($summary =~ /^The keywords are (.*)$/m) { $keywords = $1; } my $weight = $conf::Weight{'html'}->{'title'}; if (defined $title) { $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n"; } if (defined $subject) { $$weighted_str .= "\x7f$weight\x7f$subject\x7f/$weight\x7f\n"; } $fields->{'title'} = $title; $fields->{'title'} = $subject unless (defined $title); $fields->{'author'} = $lastauthor; $fields->{'author'} = $author unless (defined $lastauthor); if (defined $keywords) { $weight = $conf::Weight{'metakey'}; $$weighted_str .= "\x7f$weight\x7f$keywords\x7f/$weight\x7f\n"; } return undef; } sub utf8_to_eucjp($) { my ($cont) = @_; return undef unless (util::islang("ja")); if ($English::PERL_VERSION >= 5.008){ eval 'use Encode qw/from_to Unicode JP/;'; Encode::from_to($$cont, "utf-8" ,"euc-jp"); codeconv::normalize_eucjp($cont); return undef; } if ($nkfversion >= 2.04) { $$cont = NKF::nkf("-WemXZ1", $$cont); return undef; } return undef unless (defined $utfconvpath); my $tmpfile = util::tmpnam('NMZ.tmp.utf8'); { my $fh = util::efopen("> $tmpfile"); print $fh $$cont; util::fclose($fh); } my @cmd = ($utfconvpath, "-Iu8", "-Oej", $tmpfile); my $fh_out = IO::File->new_tmpfile(); my $status = util::syscmd( command => \@cmd, option => { "stdout" => $fh_out, "stderr" => "/dev/null", }, ); $$cont = util::readfile($fh_out); util::fclose($fh_out); codeconv::normalize_eucjp_document($cont); unlink $tmpfile; return undef; } 1;