Edict SQL generator sourcecode

Tagged code, php, programming, sql
This is the source code for the script used to generate the Edict SQL file located here.
<?php
/*
 *  Edict to MySQL converter v3
 *  Kyle Hasegawa
 *  "http://kylehasegawa.com/edict-sql.php"
 *  2010-01-28
*/


  // Global file locations
$edict_sql_file    = sys_get_temp_dir() . "/edict.sql.gz";
$local_edict_file  = sys_get_temp_dir() . "/edict.gz";
$remote_edict_file = "http://ftp.monash.edu.au/pub/nihongo/edict.gz";

  // If the file doesn't exist or it's old create/update it.
if (! file_exists($edict_sql_file) || (time() - filemtime($edict_sql_file) > 60*60*24*10)) {
  update_edict_sql_file();
}
  // Serve the edict file
serve_edict_sql_file();

/*
 * serve_edict_sql_file()
 * Serves the gzip file via PHP by setting header data
*/

function serve_edict_sql_file(){
  GLOBAL $edict_sql_file;
  $size = filesize($edict_sql_file);
  if(! $size || $size < 2*1024*1024) {
    header('HTTP/1.0 404 Not Found');
    exit('Error: file temporarily unavailable');
  }
  header('Content-Description: File Transfer');
  header('Content-Type: application/x-gzip');
  header('Content-Disposition: attachment; filename='.basename($edict_sql_file));
  header('Content-Transfer-Encoding: binary');
  header('Expires: 0');
  header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
  header('Pragma: public');
  header("Content-Length: $size");
  ob_clean();
  flush();
  readfile($edict_sql_file);
}


/*
 * download_edict_file()
 * Downloads an updated edict file from Monash Univ.
 */

function download_edict_file() {
  GLOBAL $local_edict_file;
  GLOBAL $remote_edict_file;

  $remote_file_data = file_get_contents($remote_edict_file);
  if(! $remote_file_data) return FALSE;

  $local_edict_handle = fopen($local_edict_file, "w");
  if(! $local_edict_handle) return FALSE;

  fwrite($local_edict_handle, $remote_file_data);
  fclose($local_edict_handle);

  unset($remote_file_data);
  return TRUE;
}


/*
 * update_edict_sql_file()
 * Updates the edict sql file
 */

function update_edict_sql_file(){
  GLOBAL $local_edict_file;
  GLOBAL $edict_sql_file;

    // I have multiple scripts using this file so the local file may or may not be old
  if (! file_exists($local_edict_file) || (time() - filemtime($local_edict_file) > 60*60*24*10)) {
      // If the local file can't be downloaded return.
    if(! download_edict_file()) return FALSE;
  }
    // Open the local file for reading
  $local_edict_handle = gzopen($local_edict_file, "r");
  if(! $local_edict_handle) return FALSE;

    // Open the edict sql file for writing
  $sql_format = gzopen($edict_sql_file, "w9");
  if(! $sql_format) return FALSE;


  $timestamp = date("F d Y", time());

    // Create the header of the SQL file
  $header = <<<HEADER
-- MySQL dump 10.10
--
-- Host: localhost    Database: edict
-- Created from: EDICT, EDICT_SUB(P), EDICTEXT Japanese-English Electronic Dictionary Files/Copyright Electronic Dictionary Research & Development Group - 2006/
-- Created: date $timestamp
-- MySQL conversion by Kyle Hasegawa
-- ------------------------------------------------------
-- Server version       5.0.22

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;

--
-- Table structure for table `edict`
--

DROP TABLE IF EXISTS `edict`;
CREATE TABLE `edict` (
  `japanese` varchar(80) default NULL,
  `yomigana` varchar(80) default NULL,
  `english` text
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='edict';

--
-- Dumping data for table `edict`
--


/*!40000 ALTER TABLE `edict` DISABLE KEYS */;
LOCK TABLES `edict` WRITE;
INSERT INTO `edict` VALUES
HEADER;

    // create an insert line which will be used to break the insert into smaller pieces or else we'll get mysql max_allowed_packet errors.
  $insert = ";\nINSERT INTO `edict` VALUES ";

    // create the footer which completes the insert
  $footer = <<<FOOTER
;
UNLOCK TABLES;
/*!40000 ALTER TABLE `edict` ENABLE KEYS */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
FOOTER;

    // start the sql string with the header and concatenate as we go on
  $sql = $header;
    // keep track of the line number being processed
  $linenum = 0;
    // loop through the edict file
  while (!feof($local_edict_handle)) {
    $buffer = fgets($local_edict_handle, 4096);
      // the first line of the edict file is meta data so we'll skip it
    if ($linenum == 0){
      $linenum++;
      continue;
    }
      // due to MySQL import restrictions we can't insert the whole edict at once.  Breaking up into 1000s
      // concatenating the $insert line will start a new insert command
    if(($linenum % 1000) == 0) $sql .= $insert;
    elseif($linenum > 1) $sql .= ",\n";
      // Parse each line of the edict file
      // Convert to UTF8, strip traling \n, <space>, and / and escape quotes for SQL
    $line = addslashes(rtrim(mb_convert_encoding($buffer,"UTF-8","EUC-JP"),"/ \n"));
    $fields = explode(' ',$line);
      // First field is always Japanese, sometimes yomigana
    $japanese = $yomigana = array_shift($fields);
      // If the second field (now first field) has square brackets, it's yomigana
    if(strpos($fields[0],'[') !== FALSE) {
        $yomigana = trim(array_shift($fields), '[]');
    }
      // The rest is english. Re-combine the remaining fields as a string
    $english = trim(implode(' ',$fields), '/');
    $sql .= "('$japanese','$yomigana','$english')";
    $linenum++;
  }// end while loop
    // close the edict file
  gzclose($local_edict_handle);
    // append the SQL closing lines
  $sql .= $footer;
    // write the new file
  gzwrite($sql_format, $sql);
  gzclose($sql_format);
  return TRUE;
}
?>

All code on this site is free for use at your own risk and provided as-is under the WTFPL license unless otherwise stated. Attribution is appreciated but not required.
Blog content, with the exception of externally quoted material, is licensed under the Creative Commons Attribution 3.0 license