技術日記
tDiaryからMT形式で出力するスクリプト

tDiaryからWordPressにデータをインポートしようとして、

とかを試してみたんだけど、うまくいかなかったんで適当にでっち上げた。コメントとかトラックバックには対応していないし、内部リンクとかの解決もしてない。なぜか画像へのリンクだけは適当に変換かけてある。

HTTPでがしがしアクセスしまくるんで、共有サーバーとかでそのまま実行しちゃだめ。自力で安全に使用できそうな人は参考にどうぞ。

#!/usr/bin/php
<?php

// When true, debug() prints its trace output (fetched URLs, next dates).
define('DEBUG', false);
mb_internal_encoding('utf-8');

// Export settings.
$author = 'ishinao';                       // written to the AUTHOR field of every entry
$baseUrl = 'http://tdiary.ishinao.net/';   // root URL of the diary being exported
$dailyPattern = 'Ymd.\h\t\m\l';            // date() format of a daily page file name
$sourceImagePath = 'images/';              // image path prefix used in the diary HTML
$destImagePath = '/wp-content/';           // prefix that replaces it in the exported body

// Day to start from; each processed page supplies the next day to visit.
$startDate = '2009-10-29';

$nextDate = $startDate;
do {
    $nextDate = processDay($nextDate);
} while ($nextDate);

/**
 * Export every section of one diary day and report which day to visit next.
 *
 * Fetches the day's page, extracts the body HTML, splits it into sections,
 * prints each section with printSection(), and reads the page's rel="prev"
 * link to find the following day to process.
 *
 * @param string $date Day to export, in a format strtotime() understands.
 * @return string|null Date (YYYY-MM-DD) taken from the rel="prev" link, or
 *                     null when the page could not be fetched or has no
 *                     usable prev link. A null return ends the caller's loop.
 */
function processDay($date)
{
    $url = getDailyUrl($date);
    debug($url);

    $html = file_get_contents($url);
    if ($html === false) {
        // file_get_contents() has already raised a warning. Stop here instead
        // of feeding false into the regex helpers below.
        return null;
    }

    $body = getBodyHtml($html);
    //debug($body);

    // getBodyHtml() returns null when the expected markup is absent; treat
    // that as "no sections" rather than passing null to explode().
    $sections = ($body === null) ? array() : splitSections($body);

    foreach ($sections as $section) {
        //debug($section);
        printSection($date, array(
            'html' => $section,
            'body' => cleanupSection($section),
            'title' => getTitle($section),
            'category' => getCategory($section),
        ));
    }

    $nextDate = getPrevLink($html);
    debug($nextDate);

    return $nextDate;
}

/**
 * Print one diary section as a Movable Type import-format entry.
 *
 * Reads the global $author for the AUTHOR field. The entry is written to
 * standard output: a metadata header, one CATEGORY line per category, a
 * PRIMARY CATEGORY line (the first category) when any exist, the DATE line,
 * the BODY, and empty EXTENDED BODY / EXCERPT / KEYWORDS parts.
 *
 * @param string $date    Day of the entry, in a format strtotime() understands.
 * @param array  $section Array with the keys 'title', 'body' and 'category'
 *                        (the latter an array of category names).
 * @return void
 */
function printSection($date, $section)
{
    global $author;

    $heading = $section['title'];
    $stamp = date('m/d/Y h:i:s A', strtotime($date));
    $content = $section['body'];
    $tags = $section['category'];

    // The blank line before the closing marker keeps the newline after ALLOW PINGS.
    $entry = <<<EOD
AUTHOR: {$author}
TITLE: {$heading}
STATUS: Draft
ALLOW COMMENTS: 0
CONVERT BREAKS: __default__
ALLOW PINGS: 0

EOD;

    foreach ($tags as $tag) {
        $entry .= 'CATEGORY: ' . $tag . PHP_EOL;
    }
    if (!empty($tags)) {
        $entry .= 'PRIMARY CATEGORY: ' . $tags[0] . PHP_EOL;
    }

    $entry .= <<<EOD
DATE: {$stamp}
-----
BODY:
{$content}

-----
EXTENDED BODY:

-----
EXCERPT:

-----
KEYWORDS:

-----

--------

EOD;

    echo $entry;
}

/**
 * Turn a raw section fragment into the HTML used as the entry body.
 *
 * Removes a leading <div> and trailing </div>, every <h3> heading, and every
 * paragraph anchor (<a name="pNN" ...>...</a>), then rewrites image paths:
 * each occurrence of $sourceImagePath, optionally preceded by $baseUrl, is
 * replaced with $destImagePath. All three settings are read from globals.
 *
 * @param string $html Section HTML as produced by splitSections().
 * @return string Cleaned, trimmed HTML.
 */
function cleanupSection($html)
{
    global $baseUrl, $sourceImagePath, $destImagePath;

    // Applied one after another, in this order, each on the previous result.
    $stripPatterns = array(
        '#^<div>#',
        '#</div>$#',
        '#<h3>.+?</h3>#',
        '#<a name="p\d+".+?</a>#',
    );
    $cleaned = preg_replace($stripPatterns, '', trim($html));

    $imagePattern = '/(' . preg_quote($baseUrl, '/') . ')?' . preg_quote($sourceImagePath, '/') . '/';
    $cleaned = preg_replace($imagePattern, $destImagePath, $cleaned);

    return trim($cleaned);
}

/**
 * Extract the category names from a section's <h3> heading.
 *
 * The heading text (with <span> elements and all other tags removed) is
 * scanned for bracketed tags such as "[php]". Each tag is returned without
 * its brackets and surrounding spaces. Duplicates are removed, keeping the
 * first occurrence, and the result is reindexed from 0.
 *
 * @param string $html Section HTML.
 * @return array List of unique category names; empty when there is no <h3>
 *               heading or the heading carries no bracketed tags.
 */
function getCategory($html)
{
    $result = array();
    if (preg_match('#<h3>(.+?)</h3>#', $html, $matches)) {
        $title = preg_replace('#<span>.+?</span>#', '', $matches[1]);
        $title = trim(strip_tags($title));
        if (preg_match_all('#(\[[^\]]+\])#', $title, $matches)) {
            foreach ($matches[1] as $match) {
                $result[] = trim($match, '[] ');
            }
        }
    }

    // array_unique() returns a new array rather than modifying its argument;
    // array_values() keeps the result a 0-based list so callers can use [0].
    return array_values(array_unique($result));
}

/**
 * Determine the title of a section.
 *
 * Prefers the <h3> heading: <span> elements and all other tags are removed
 * and any leading run of bracketed tags ("[php] [web] ...") is stripped.
 * Without an <h3>, falls back to the first title="..." attribute value.
 *
 * @param string $html Section HTML.
 * @return string|null The title, or null when neither source is present.
 */
function getTitle($html)
{
    if (preg_match('#<h3>(.+?)</h3>#', $html, $heading)) {
        $withoutSpans = preg_replace('#<span>.+?</span>#', '', $heading[1]);
        $text = trim(strip_tags($withoutSpans));

        return preg_replace('#^(\[[^\]]+\]\s*)+#', '', $text);
    }

    if (preg_match('#title="(.+?)"#', $html, $attribute)) {
        return $attribute[1];
    }

    return null;
}

/**
 * Split the body HTML into per-section fragments.
 *
 * The input is cut at every "<div>". When at least one "<div>" is present,
 * the text before the first one is discarded and a trailing "</div>" (plus
 * whitespace) is removed from the last piece. Pieces that are empty, or
 * contain no text once tags are stripped, are dropped. Every kept piece is
 * trimmed and gets "<div>" prepended again.
 *
 * @param string $html Body HTML.
 * @return array List of section fragments, each starting with "<div>".
 */
function splitSections($html)
{
    $marker = '<div>';
    $pieces = explode($marker, $html);

    if (count($pieces) > 1) {
        array_shift($pieces);
        $lastIndex = count($pieces) - 1;
        $pieces[$lastIndex] = preg_replace('#</div>\s*$#', '', $pieces[$lastIndex]);
    }

    $sections = array();
    foreach ($pieces as $piece) {
        $piece = trim($piece);
        if ($piece != '' && trim(strip_tags($piece)) != '') {
            $sections[] = $marker . $piece;
        }
    }

    return $sections;
}

/**
 * Cut the diary body out of a full page.
 *
 * Matches from the first "<div>" up to the last "</div>" that is followed,
 * after optional whitespace, by another "<div>". The match is
 * case-insensitive and spans line breaks. That trailing "<div>" itself is
 * not part of the result.
 *
 * @param string $html Full page HTML.
 * @return string|null The matched fragment, or null when nothing matches.
 */
function getBodyHtml($html)
{
    $found = preg_match('#(<div>.+</div>\s*)<div>#ims', $html, $captured);

    return $found ? $captured[1] : null;
}

/**
 * Read the date of the previous diary day from the page's rel="prev" link.
 *
 * Looks for a <link rel="prev" ... href="..."> element and takes the first
 * run of eight digits in its href as YYYYMMDD.
 *
 * @param string $html Full page HTML.
 * @return string|null The date as "YYYY-MM-DD", or null when there is no
 *                     prev link or its href contains no eight-digit run.
 */
function getPrevLink($html)
{
    if (!preg_match('#<link rel="prev".+?href="(.+?)".*?>#', $html, $linkMatch)) {
        return null;
    }

    if (!preg_match('#(\d{4})(\d{2})(\d{2})#', $linkMatch[1], $ymd)) {
        return null;
    }

    return implode('-', array($ymd[1], $ymd[2], $ymd[3]));
}

/**
 * Build the URL of the daily page for a given day.
 *
 * Formats the day with the global $dailyPattern (a date() format string)
 * and appends the result to the global $baseUrl.
 *
 * @param string $date Day, in a format strtotime() understands.
 * @return string URL of that day's page.
 */
function getDailyUrl($date)
{
    global $baseUrl, $dailyPattern;

    $timestamp = strtotime($date);
    $fileName = date($dailyPattern, $timestamp);

    return $baseUrl . $fileName;
}

/**
 * Print a trace line followed by a 50-dash separator, but only when the
 * DEBUG constant is truthy.
 *
 * @param mixed $string Value to print.
 * @return void
 */
function debug($string)
{
    if (!DEBUG) {
        return;
    }

    $separator = str_repeat('-', 50);
    $line = $string . PHP_EOL . $separator . PHP_EOL;
    echo $line;
}

Published At: 2010-05-07 15:15 / Updated At: 2019-12-30 15:35