2

我正在尝试将 xml 转储从 trac db 转换为 mediawiki 格式。我正在使用以下 perl 脚本来执行转换:

#!/usr/bin/perl
# trac2mw
# Converts a trac wiki to MediaWiki format
# Input:
#   MySQL XML export of the Trac database
# Output:
#   MediaWiki XML, suitable for feeding to Special:Import
#
# by Matthew Sachs <matthewg@zevils.com>
#
#This work is hereby released into the Public Domain. To view a copy
#of the public domain dedication, visit:
#  http://creativecommons.org/licenses/publicdomain/

# Name of your MediaWiki
use constant WIKI_NAME => "MyWiki";

# Front page of your MediaWiki
use constant WIKI_BASE => "http://example.com/Wiki/Main_Page";

# To whitelist pages, list them here, one per line, e.g.
#   @includePages = qw(
#   FooBar
#   FooBaz
#   );
#
# If any pages are listed, only pages on the list will be included.
my @includePages = qw(
);

# Pages can be renamed.
# Trac page name => MediaWiki page name
my %renamePages = (WikiStart => "Main Page");

# Trac -> MediaWiki author map.
# TracUser => [MediaWiki username, MediaWiki user ID]
# Example:
#   my %authors = (
#     matthew => ["Matthew", 1],
#     liz => ["Liz", 2]
#   );
my %authors = ();

# END CONFIGURATION


use strict;
use warnings;
use XML::LibXML;
use POSIX qw(strftime);

our $VERSION = "0.1";

sub fixWikiList {
  my($in) = @_;

  my $ret = "";
  my @rows = split(/\n/, $in);
  my @inlists = ();

  foreach my $row (@rows) {
    $row =~ s/^ ( *)([#*])//;
    my($space, $toplist) = ($1, $2);

    while(length($space) < @inlists) {
      pop @inlists;
    }
    push @inlists, $toplist;

    $ret .= join("", @inlists) . $row . "\n";
  }

  return $ret;
}

sub fixWikiTable {
  my($in) = @_;
  my $ret = "{|\n";
  my @rows = split(/\n/, $in);

  foreach my $row (@rows) {
    $ret .= "|-\n";
    $row =~ s/^\|\|//;
    $row =~ s/\|\|$//;
    foreach my $col (split(/\|\|/, $row)) {
      $ret .= "| $col\n";
    }
  }

  $ret =~ s/\n\| -$/\n/;
  $ret .= "|}\n";
  return $ret;
}

sub fixWikiText {
  my($text) = @_;

  # [wiki:Foo] -> [Foo]
  $text =~ s/\[wiki:/[/g;

  # [Link] -> [[Link]]
  $text =~ s/\[([a-z0-9 _-]+)\]/[[$1]]/ig;

  # <sp><sp>* -> **
  $text =~ s/((?:^(?: +)(?:[*#])(?:.*)$(?:\n?))+)/fixWikiList($1)/gem;

  # CamelCase -> [[CamelCase]]
  $text =~ s^ ([A-Z][a-z0-9]+[A-Z][a-z0-9]+[a-zA-Z0-9]*)\b^ [[$1]]^g;

  # Table syntax
  $text =~ s/((?:^\|\|.*\|\|$(?:\n?))+)/fixWikiTable($1)/gem;

  return $text;
}


my $parser = XML::LibXML->new();
my $doc = $parser->parse_file(shift);
my $root = $doc->documentElement();

my @wikiPages = $root->findnodes("//table_data[\@name='wiki']/row");
my %pages;
foreach my $node (@wikiPages) {
  my $name = $node->find("field[\@name='name']")->[0]->textContent();
  next unless grep { $_ eq $name } @includePages;

  $name = $renamePages{$name} if $renamePages{$name};

  my $text = $node->find("field[\@name='text']")->[0]->textContent();
  my $author = $node->find("field[\@name='author']")->[0]->textContent();
  my $time = $node->find("field[\@name='time']")->[0]->textContent();
  my $version = $node->find("field[\@name='version']")->[0]->textContent();
  $version--;
  $pages{$name} ||= [];
  $pages{$name}->[$version] = {
                               text => $text,
                               author => $author,
                               time => $time,
                              };
}

my $outDoc = XML::LibXML::Document->new();
my $mw = XML::LibXML::Element->new("mediawiki");
$outDoc->setDocumentElement($mw);

use constant MW_NS => "http://www.mediawiki.org/xml/export-0.3/";
use constant XSI_NS => "http://www.w3.org/2001/XMLSchema-instance";

$mw->setNamespace(XSI_NS, "xsi");
$mw->setNamespace(MW_NS);
$mw->setAttributeNS(XSI_NS, "schemaLocation", "http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd");
$mw->setAttributeNS(MW_NS, "version", "0.3");
$mw->setAttribute("xml:lang", "en");

my $siteinfo = XML::LibXML::Element->new("siteinfo");
$mw->addChild($siteinfo);
$siteinfo->appendTextChild("sitename", WIKI_NAME);
$siteinfo->appendTextChild("base", WIKI_BASE);
$siteinfo->appendTextChild("generator", "trac2mw $VERSION");
$siteinfo->appendTextChild("case", "first-letter");

my $namespaces = XML::LibXML::Element->new("namespaces");
$siteinfo->addChild($namespaces);
foreach my $ns ([-2, "Media"], [-1, "Special"],
                [0, ""], [1, "Talk"],
                [2, "User"],
                [3, "User talk"],
                [4, WIKI_NAME],
                [5, WIKI_NAME . " talk"],
                [6, "Image"],
                [7, "Image talk"],
                [8, "MediaWiki"],
                [9, "MediaWiki talk"],
                [10, "Template"],
                [11, "Template talk"],
                [12, "Help"],
                [13, "Help talk"],
                [14, "Category"],
                [15, "Category talk"],) {

  my $nsElement = XML::LibXML::Element->new("namespace");
  $namespaces->addChild($nsElement);
  $nsElement->setAttribute("key", $ns->[0]);
  $nsElement->appendText($ns->[1]);
}

my $pageID = 1000;
while(my($page, $revisions) = each(%pages)) {
  my $pageNode = XML::LibXML::Element->new("page");
  $mw->addChild($pageNode);
  $pageNode->appendTextChild("title", $page);
  $pageNode->appendTextChild("id", $pageID);

  my $revID = 1;
  foreach my $revision(@$revisions) {
    my($text, $author, $time) = ($revision->{text},
                                 $revision->{author},
                                 $revision->{time});

    $text = fixWikiText($text);

    my $revision = XML::LibXML::Element->new("revision");
    $pageNode->addChild($revision);
    $revision->appendTextChild("id", $pageID + $revID);
    $revision->appendTextChild("timestamp",
                               strftime("%Y-%m-%dT%H:%M:%SZ",
                                        gmtime($time)));

    my $contributor = XML::LibXML::Element->new("contributor");
    $revision->addChild($contributor);
    my($user_id) = $authors{$author};
    if($user_id) {
      $contributor->appendTextChild("username", $user_id->[0]);
      $contributor->appendTextChild("id", $user_id->[1]);
    }

    my $textNode = XML::LibXML::Element->new("text");
    $revision->addChild($textNode);
    $textNode->setAttribute("xml:space", "preserve");
    $textNode->appendText($text);
  } continue {
    $revID++;
  }
} continue {
  $pageID += 1000;
}

$outDoc->toFH(\*STDOUT, 1);

我的 xml 文件以:

<?xml version="1.0" encoding="utf-8"?>
<!--
- phpMyAdmin XML Dump
- version 3.3.7deb7
- http://www.phpmyadmin.net
-
- Host: localhost
- Generation Time: Jul 23, 2012 at 04:21 PM
- Server version: 5.1.61
- PHP Version: 5.3.3-7+squeeze13
-->
<pma_xml_export version="1.0">
<!--
- Database: 'trac'
-->
<database name="trac">
    <!-- Table wiki -->
    <table name="wiki">
        <column name="name">TracAdmin</column>
        <column name="version">1</column>
        <column name="time">1205745178</column>
        <column name="author">trac</column>
        <column name="ipnr">127.0.0.1</column>
        <column name="text">= TracAdmin =
       [[TracGuideToc]]
   ...

当我开始转换时,我收到以下错误:

# perl trac2mw.pl tracxmldump.xml
bad ns attribute! at /usr/lib/perl5/XML/LibXML.pm line 1574.

这与脚本或 libxml 有关吗?我该如何解决?

4

0 回答 0