MyBB Community Forums

Full Version: [B] UTF-8 conversion screws up postings with umlauts [R]
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
If you convert your tables from latin to utf8 all contents with umlauts get screwed up.

Steps to reproduce:
1. Set up a new board and choose "ISO 8859-2 Central European" as database encoding. Make sure you also set "iso-8859-2" as the encoding in the file inc/languages/*language*.php

2. Write a new post with the following content:
Quote:This is just a test. AƤ test test test.

3. Change the encoding settings in inc/config.php and inc/languages/*language*.php to utf8.

4. Convert all tables to utf-8.

After that the post looks like this:
Quote:This is just a test. A
It ends where the umlaut should be.
Not sure what to tell you. We do what the MySQL docs say to do.
How did you do step 4 - convert all tables to UTF-8?
(2008-11-14, 08:40 AM)frostschutz Wrote: [ -> ]How did you do step 4 - convert all tables to UTF-8?

This is the snippet of code that actually does the processing from admin/modules/tools/system_health.php

$old_table_prefix = $db->table_prefix;
		$db->set_table_prefix('');
		
		$table = new Table;
		
		$table1 = $db->show_create_table($db->escape_string($mybb->input['table']));
        preg_match("#CHARSET=([a-zA-Z0-9_]+)\s?#i", $table1, $matches);
		$charset = $matches[1];
		
		$table->construct_cell("<strong>".$lang->sprintf($lang->converting_to_utf8, $mybb->input['table'], $charset)."</strong>");
		$table->construct_row();
		
		$table->construct_cell($lang->please_wait);
		$table->construct_row();
		
		$table->output($converting_table." {$mybb->input['table']}");
		
		$db->set_table_prefix($old_table_prefix);
		
		$page->output_footer(false);
		
		$old_table_prefix = $db->table_prefix;
		$db->set_table_prefix('');
		
		flush();

		$types = array(
			'text' => 'blob',
			'mediumtext' => 'mediumblob',
			'longtext' => 'longblob',
			'char' => 'varbinary',
			'varchar' => 'varbinary',
			'tinytext' => 'tinyblob'			
		);
		
		// Get next table in list
		$convert_to_binary = '';
		$convert_to_utf8 = '';
		$comma = '';
		
		// Set table default charset
		$db->write_query("ALTER TABLE {$mybb->input['table']} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci");

		// Fetch any fulltext keys
		if($db->supports_fulltext($mybb->input['table']))
		{
			$table_structure = $db->show_create_table($mybb->input['table']);
			switch($db->type)
			{
				case "mysql":
				case "mysqli":
					preg_match_all("#FULLTEXT KEY `?([a-zA-Z0-9_]+)`? \(([a-zA-Z0-9_`,']+)\)#i", $table_structure, $matches);
					if(is_array($matches))
					{
						foreach($matches[0] as $key => $matched)
						{
							$db->write_query("ALTER TABLE {$mybb->input['table']} DROP INDEX {$matches[1][$key]}");
							$fulltext_to_create[$matches[1][$key]] = $matches[2][$key];
						}
					}
			}
		}

		// Find out which columns need converting and build SQL statements
		$query = $db->query("SHOW FULL COLUMNS FROM {$mybb->input['table']}");
		while($column = $db->fetch_array($query))
		{
			list($type) = explode('(', $column['Type']);
			if(array_key_exists($type, $types))
			{
				// Build the actual strings for converting the columns
				$names = "CHANGE `{$column['Field']}` `{$column['Field']}` ";
				
				$attributes = " DEFAULT ";
				if($column['Default'] == 'NULL')
				{
					$attributes .= "NULL ";
				}
				else
				{
					$attributes .= "'".$db->escape_string($column['Default'])."' ";
					
					if($column['Null'] == 'YES')
					{
						$attributes .= 'NULL';
					}
					else
					{
						$attributes .= 'NOT NULL';
					}
				}
				
				$convert_to_binary .= $comma.$names.preg_replace('/'.$type.'/i', $types[$type], $column['Type']).$attributes;
				$convert_to_utf8 .= "{$comma}{$names}{$column['Type']} CHARACTER SET utf8 COLLATE utf8_general_ci{$attributes}";
				
				$comma = ', ';
			}
		}
		
		if(!empty($convert_to_binary))
		{
			// This converts the columns to UTF-8 while also doing the same for data
			$db->write_query("ALTER TABLE {$mybb->input['table']} {$convert_to_binary}");
			$db->write_query("ALTER TABLE {$mybb->input['table']} {$convert_to_utf8}");
		}
		
		// Any fulltext indexes to recreate?
		if(is_array($fulltext_to_create))
		{
			foreach($fulltext_to_create as $name => $fields)
			{
				$db->create_fulltext_index($mybb->input['table'], $fields, $name);
			}
		}
		
		$db->set_table_prefix($old_table_prefix);
		
		$plugins->run_hooks("admin_tools_system_health_utf8_conversion_commit");
		
		// Log admin action
		log_admin_action($mybb->input['table']);
		
		flash_message($lang->sprintf($lang->success_table_converted, $mybb->input['table']), 'success');
As far as I'm concerned this isn't a problem in MyBB. We run the process that is supposed to convert it properly, but it's up to MySQL to actually convert it to binary then convert it to utf8 properly. That's the process MySQL tells us developers to do: http://dev.mysql.com/doc/refman/5.0/en/alter-table.html

Quote:Warning

The CONVERT TO operation converts column values between the character sets. This is not what you want if you have a column in one character set (like latin1) but the stored values actually use some other, incompatible character set (like utf8). In this case, you have to do the following for each such column:

ALTER TABLE t1 CHANGE c1 c1 BLOB;
ALTER TABLE t1 CHANGE c1 c1 TEXT CHARACTER SET utf8;

The reason this works is that there is no conversion when you convert to or from BLOB columns.

If there's a problem with it, then they need to fix it.

Unless anyone has some brighter ideas, I'm marking this as bogus.