N-gram technique.
Version: 0.2.2
Author: Joel Lee
Author URI: http://blog.bcse.info
License: GNU General Public License 2.0
*/

/**
 * Rebuild the n-gram table(s) from scratch.
 *
 * The target table is truncated and then refilled with one row per post
 * (posts table) or one row per commented post (comments table, all comment
 * bodies of a post concatenated).
 *
 * @param string  $target 'posts', 'comments' or 'both'
 * @param integer $N      n-gram length used for ideographic text
 */
function bfs_BuildNgramTable($target='both', $N=2)
{
    global $wpdb;

    $build_posts    = ( $target == 'posts' || $target == 'both' );
    $build_comments = ( $target == 'comments' || $target == 'both' );

    if ( $build_posts ) {
        //Truncate n-gram table
        $wpdb->query("TRUNCATE {$wpdb->prefix}posts_ngram");

        //Get posts
        $posts = $wpdb->get_results("SELECT ID, post_title, post_content FROM $wpdb->posts");

        //Build initial n-gram data
        //(INSERT DELAYED is deprecated in MySQL, so a plain INSERT is used)
        foreach ( (array) $posts as $post ) {
            $post_id            = (int) $post->ID;
            $post_title_ngram   = bfs_StringToNgram($post->post_title, $N, TRUE, ' ');
            $post_content_ngram = bfs_StringToNgram($post->post_content, $N, TRUE, ' ');
            $wpdb->query("INSERT INTO {$wpdb->prefix}posts_ngram (ID, post_title_ngram, post_content_ngram) VALUES ('{$post_id}', '{$post_title_ngram}', '{$post_content_ngram}')");
        }
    }

    if ( $build_comments ) {
        //Truncate n-gram table
        $wpdb->query("TRUNCATE {$wpdb->prefix}comments_ngram");

        //Get comments (pingbacks and trackbacks are not indexed)
        $comments = $wpdb->get_results("SELECT comment_post_ID, comment_content FROM $wpdb->comments WHERE comment_type NOT IN ('pingback', 'trackback')");

        //Group comment bodies by the post they belong to.
        //Initialised up front so a site without comments does not iterate
        //over an undefined variable below.
        $concated_comments = array();
        foreach ( (array) $comments as $comment ) {
            $concated_comments[$comment->comment_post_ID][] = $comment->comment_content;
        }

        //One row per post, holding the n-grams of all its comments
        foreach ( $concated_comments as $comment_post_ID => $comment_contents ) {
            $comment_post_ID                = (int) $comment_post_ID;
            $concated_comment_content_ngram = bfs_StringToNgram(implode(' ', $comment_contents), $N, TRUE, ' ');
            $wpdb->query("INSERT INTO {$wpdb->prefix}comments_ngram (ID, comment_content_ngram) VALUES ('{$comment_post_ID}', '{$concated_comment_content_ngram}')");
        }
    }
}

/**
 * Update or insert the n-gram record of a single post.
 *
 * @param integer $id post primary key (post_ID)
 * @param integer $N  n-gram length used for ideographic text
 */
function bfs_UpdatePostNgramRow($id, $N=2)
{
    global $wpdb;

    //The id ends up inside SQL, force it to be an integer
    $id = (int) $id;

    $post = $wpdb->get_row("SELECT post_title, post_content FROM $wpdb->posts WHERE ID = '{$id}'");
    if ( !is_object($post) ) {
        //Nothing to index: the post does not exist
        return;
    }

    $post_title_ngram   = bfs_StringToNgram($post->post_title, $N, TRUE, ' ');
    $post_content_ngram = bfs_StringToNgram($post->post_content, $N, TRUE, ' ');

    //DELAYED is ignored by MySQL together with ON DUPLICATE KEY UPDATE, so it is left out
    $wpdb->query("INSERT INTO {$wpdb->prefix}posts_ngram (ID, post_title_ngram, post_content_ngram) VALUES ('{$id}', '{$post_title_ngram}', '{$post_content_ngram}') ON DUPLICATE KEY UPDATE post_title_ngram = '{$post_title_ngram}', post_content_ngram = '{$post_content_ngram}'");
}

/**
 * Update or insert the comments n-gram record of the post a comment belongs to.
 *
 * The record is rebuilt from every comment of that post, using the same
 * comment filter as bfs_BuildNgramTable() (no pingbacks, no trackbacks).
 *
 * NOTE(review): this function is also hooked to 'delete_comment'. If that
 * action fires before the comment row is removed, the deleted comment is
 * still part of the rebuilt record - confirm against the WordPress version
 * in use.
 *
 * @param integer $id comment primary key (comment_ID)
 * @param integer $N  n-gram length used for ideographic text
 */
function bfs_UpdateCommentNgramRow($id, $N=2)
{
    global $wpdb;

    $id = (int) $id;

    $post_id = $wpdb->get_var("SELECT comment_post_ID FROM $wpdb->comments WHERE comment_ID = '{$id}' LIMIT 1");
    if ( empty($post_id) ) {
        //Unknown comment: there is no post whose record could be updated
        return;
    }
    $post_id = (int) $post_id;

    $comments = $wpdb->get_col("SELECT comment_content FROM $wpdb->comments WHERE comment_post_ID = '{$post_id}' AND comment_type NOT IN ('pingback', 'trackback')");
    $comment_content_ngram = bfs_StringToNgram(implode(' ', (array) $comments), $N, TRUE, ' ');

    $wpdb->query("INSERT INTO {$wpdb->prefix}comments_ngram (ID, comment_content_ngram) VALUES ('{$post_id}', '{$comment_content_ngram}') ON DUPLICATE KEY UPDATE comment_content_ngram = '{$comment_content_ngram}'");
}

/**
 * Delete the n-gram records (posts and comments table) of a post.
 *
 * @param integer $id post primary key (post_ID)
 */
function bfs_DeleteNgramRow($id)
{
    global $wpdb;

    //The id is interpolated unquoted below, so it must be an integer
    $id = (int) $id;

    $wpdb->query("DELETE FROM {$wpdb->prefix}posts_ngram WHERE ID = $id LIMIT 1");
    $wpdb->query("DELETE FROM {$wpdb->prefix}comments_ngram WHERE ID = $id LIMIT 1");
}

/**
 * Generate the full-text search query for the given search criteria.
 *
 * Title matches are weighted x100, content matches x50 and (when the
 * 'bcse_bfs_include_comment' option is on) comment matches x10.
 *
 * @param string  $criteria what the user typed into the search box
 * @param integer $N        n-gram length used for ideographic text
 * @return string SQL full-text search query
 */
function bfs_GetSearchQuery($criteria, $N=2)
{
    global $wp_query, $wpdb, $wp_version;

    $include_comment = get_option('bcse_bfs_include_comment');

    //Construct limit (values are cast because they are interpolated into SQL)
    $current_page = !empty($wp_query->query_vars['paged']) ? (int) $wp_query->query_vars['paged'] : 1;
    $limit_diff   = (int) get_option('posts_per_page');
    $limit        = max(0, ( $current_page - 1 ) * $limit_diff);

    //Build ngram query
    $ngram_query = bfs_ParseOperators($criteria, $N);

    $calc_found_rows = ( !empty($wp_version) && version_compare($wp_version, '2.1', '>') ) ? 'SQL_CALC_FOUND_ROWS' : '';

    $posts_ngram    = "{$wpdb->prefix}posts_ngram";
    $comments_ngram = "{$wpdb->prefix}comments_ngram";
    $against        = "AGAINST('{$ngram_query}' IN BOOLEAN MODE)";

    //Score expression
    $query = "SELECT\n"
           . " {$calc_found_rows} {$wpdb->posts}.*,\n"
           . " (MATCH({$posts_ngram}.post_title_ngram)\n"
           . " {$against}*100) +\n"
           . " (MATCH({$posts_ngram}.post_content_ngram)\n"
           . " {$against}*50)";
    if ( $include_comment ) {
        $query .= " +\n"
                . " (MATCH({$comments_ngram}.comment_content_ngram)\n"
                . " {$against}*10)";
    }

    //Tables
    $query .= " AS score\n"
            . "FROM\n"
            . " {$wpdb->posts} INNER JOIN {$posts_ngram} USING (ID)";
    if ( $include_comment ) {
        $query .= " LEFT JOIN {$comments_ngram} USING (ID)";
    }

    //Match condition
    $query .= "\nWHERE\n"
            . " (MATCH({$posts_ngram}.post_title_ngram, {$posts_ngram}.post_content_ngram)\n"
            . " {$against}";
    if ( $include_comment ) {
        $query .= "\n OR MATCH({$comments_ngram}.comment_content_ngram)\n"
                . " {$against}";
    }

    //Visibility, ordering and paging.
    //post_date_gmt is stored in GMT, so it is compared with the current GMT time.
    $query .= ")\n"
            . " AND post_date_gmt <= '" . gmdate('Y-m-d H:i:s') . "'\n"
            . " AND post_password = ''\n"
            . " AND (post_status = 'publish' OR post_status = 'private')\n"
            . "ORDER BY\n"
            . " score DESC, post_date_gmt DESC\n"
            . "LIMIT\n"
            . " " . $limit . ", " . $limit_diff;

    return $query;
}

/**
 * 'posts_request' filter: replace the posts query by the full-text search
 * query whenever the request carries a non-empty search string ($_GET['s']).
 *
 * @param string  $query the SQL query WordPress is about to run
 * @param integer $N     n-gram length used for ideographic text
 * @return string the full-text search query, or $query unchanged
 */
function bfs_BuildSearchQuery($query, $N=2)
{
    //Only a plain string can be parsed as search criteria (?s[]=x yields an array)
    if ( !empty($_GET['s']) && is_string($_GET['s']) ) {
        $query = bfs_GetSearchQuery($_GET['s'], $N);
    }
    //echo ""; //For debugging
    return $query;
}

/**
 * Add Bigram Full-Text Search options page to Options sub-menu
 */
function bfs_AdminMenu()
{
    add_options_page(
        'Bigram Full-Text Search Options',
        'Bigram Full-Text Search',
        5,
        basename(__FILE__),
        'bfs_OptionsPage'
    );
}

/**
 * Format a byte count with the largest fitting unit (bytes, KB or MB).
 *
 * @param integer $byte      size in bytes
 * @param integer $precision decimals kept for KB and MB values
 * @return string e.g. '1.5 KB'
 */
function bfs_ByteAutoUnit($byte, $precision=2)
{
    if ( $byte >= 1048576 ) {
        return round($byte/1048576, $precision) . ' MB';
    }
    if ( $byte >= 1024 ) {
        return round($byte/1024, $precision) . ' KB';
    }
    return $byte . ' bytes';
}

/**
 * Bigram Full-Text Search options page.
 *
 * Handles three kinds of POST requests (rebuild n-gram tables, uninstall,
 * save options) and then prints the page.
 *
 * NOTE(review): the strings echoed below contain no markup and no form
 * fields, and $include_comment / $strip_html / $stop_words are read but not
 * printed - the HTML of this page appears to have been stripped from this
 * copy of the file. The strings are kept exactly as found; restore the
 * markup from the original plugin distribution.
 */
function bfs_OptionsPage()
{
    global $wpdb;

    //Initialize update flag
    $update = FALSE;

    //Get requests
    if ( !empty($_POST['rebuild_ngram']) ) {
        bfs_BuildNgramTable($_POST['rebuild_ngram']);
    } elseif ( !empty($_POST['uninstall']) && $_POST['uninstall'] == 'Uninstall' ) {
        //Drop n-gram tables.
        //Identifiers are quoted with backticks; single quotes would make the
        //name a string literal and the statement a syntax error.
        $wpdb->query("DROP TABLE IF EXISTS `{$wpdb->prefix}posts_ngram`");
        $wpdb->query("DROP TABLE IF EXISTS `{$wpdb->prefix}comments_ngram`");

        //Delete all options
        delete_option('bcse_bfs_version');
        delete_option('bcse_bfs_include_comment');
        delete_option('bcse_bfs_strip_html');
        delete_option('bcse_bfs_stop_words');

        //Deactivate plugin.
        //The list is only touched when this plugin is actually found in it,
        //otherwise an unrelated plugin would be removed.
        $current_active_plugins = get_option('active_plugins');
        $plugin_filename        = ( isset($_GET['page']) && is_string($_GET['page']) ) ? $_GET['page'] : '';
        if ( is_array($current_active_plugins) && $plugin_filename !== '' ) {
            foreach ( $current_active_plugins as $key => $plugin ) {
                if ( strstr($plugin, $plugin_filename) ) {
                    unset($current_active_plugins[$key]);
                    update_option('active_plugins', array_values($current_active_plugins));
                    break;
                }
            }
        }

        //Done! :p
        wp_redirect('plugins.php?deactivate=true');
    } elseif ( !empty($_POST['submit']) ) {
        // "Search in comments"
        $include_comment = ( !empty($_POST['include_comment']) && $_POST['include_comment'] == 'true' );
        update_option('bcse_bfs_include_comment', $include_comment);

        // "Strip HTML"
        $strip_html = ( !empty($_POST['strip_html']) && $_POST['strip_html'] == 'true' );
        update_option('bcse_bfs_strip_html', $strip_html);

        // "Stop words list"
        $stop_words = !empty($_POST['stop_words']) ? $wpdb->escape($_POST['stop_words']) : array();
        update_option('bcse_bfs_stop_words', $stop_words);

        $update = TRUE;
    }

    //Get posts n-gram table status
    $posts_ngram_table_status = $wpdb->get_row("SHOW TABLE STATUS LIKE '{$wpdb->prefix}posts_ngram'");
    $posts_ngram_data_length  = bfs_ByteAutoUnit($posts_ngram_table_status->Data_length);
    $posts_ngram_index_length = bfs_ByteAutoUnit($posts_ngram_table_status->Index_length);

    //Get comments n-gram table status
    $comments_ngram_table_status = $wpdb->get_row("SHOW TABLE STATUS LIKE '{$wpdb->prefix}comments_ngram'");
    $comments_ngram_data_length  = bfs_ByteAutoUnit($comments_ngram_table_status->Data_length);
    $comments_ngram_index_length = bfs_ByteAutoUnit($comments_ngram_table_status->Index_length);

    //Get options
    $include_comment = get_option('bcse_bfs_include_comment');
    $strip_html      = get_option('bcse_bfs_strip_html');
    $stop_words      = get_option('bcse_bfs_stop_words');

    if ( $update ) {
        echo '

Options saved.

';
    }

    echo '

Search Options

Disabling this option will remove HTML tags in n-gram, so that you can not search it. If your HTML tags or attributes contain important keywords, do not disable this option.

Stop words is the name given to words which are filtered out prior to, or after, processing of natural language data (text).

Manage n-gram tables

Table status

Table name: ' . $posts_ngram_table_status->Name . '
Rows: ' . $posts_ngram_table_status->Rows . ' rows
Data length: ' . $posts_ngram_data_length . '
Index length: ' . $posts_ngram_index_length . '
Table name: ' . $comments_ngram_table_status->Name . '
Rows: ' . $comments_ngram_table_status->Rows . ' rows
Data length: ' . $comments_ngram_data_length . '
Index length: ' . $comments_ngram_index_length . '

Rebuild n-gram data

Which table to rebuild?

';
}

/**
 * Check if Bigram Full-Text Search is installed or deprecated.
 * If so, then install or upgrade it: create the missing n-gram tables,
 * fill them, and record the installed version.
 */
function bfs_CheckInstall()
{
    global $wpdb;

    $bcse_bfs_version = get_option('bcse_bfs_version');
    if ( !$bcse_bfs_version ) {
        $bcse_bfs_version = '0';
    }

    if ( !version_compare($bcse_bfs_version, '0.2', '<') ) {
        //Already installed and up to date
        return;
    }

    $exists_posts_ngram_table    = is_object($wpdb->get_row("SHOW TABLES LIKE '{$wpdb->prefix}posts_ngram'"));
    $exists_comments_ngram_table = is_object($wpdb->get_row("SHOW TABLES LIKE '{$wpdb->prefix}comments_ngram'"));

    if ( !$exists_posts_ngram_table || !$exists_comments_ngram_table ) {
        //Character set / collation clause for the new tables, taken from the
        //database connection settings when they are available
        $charset_collate = '';
        if ( !empty($wpdb->charset) ) {
            $charset_collate = "DEFAULT CHARACTER SET {$wpdb->charset}";
        }
        if ( !empty($wpdb->collate) ) {
            $charset_collate .= " COLLATE {$wpdb->collate}";
        }

        //Create or upgrade n-gram table
        $ngram_schema = "CREATE TABLE {$wpdb->prefix}posts_ngram ("
                      . " ID bigint(20) unsigned NOT NULL default '0',"
                      . " post_title_ngram text NOT NULL,"
                      . " post_content_ngram longtext NOT NULL,"
                      . " PRIMARY KEY (ID),"
                      . " FULLTEXT KEY post_title (post_title_ngram,post_content_ngram),"
                      . " FULLTEXT KEY post_content (post_content_ngram)"
                      . ") ENGINE=MyISAM $charset_collate;"
                      . "CREATE TABLE {$wpdb->prefix}comments_ngram ("
                      . " ID bigint(20) unsigned NOT NULL default '0',"
                      . " comment_content_ngram longtext NOT NULL,"
                      . " PRIMARY KEY (ID),"
                      . " FULLTEXT KEY comment_content (comment_content_ngram)"
                      . ") ENGINE=MyISAM $charset_collate;";
        require_once(ABSPATH . 'wp-admin/upgrade-functions.php');
        dbDelta($ngram_schema);
    }

    //Build initial n-gram data
    if ( !$exists_posts_ngram_table ) {
        bfs_BuildNgramTable('posts');
    }
    if ( !$exists_comments_ngram_table ) {
        bfs_BuildNgramTable('comments');
    }

    //Record installed version
    update_option('bcse_bfs_version', '0.2');

    //Some default values
    //update_option('bcse_bfs_include_comment', TRUE);
    //update_option('bcse_bfs_strip_html', FALSE);
}

/**
 * Parse search criteria into a boolean full-text search expression.
 *
 * Terms may be prefixed with '+', '-' or '~', or end with '*'. When the
 * criteria contain no operator at all, every term is required (AND search).
 *
 * @param string  $str search criteria
 * @param integer $N   n-gram length used for ideographic text
 * @return string boolean full-text search query
 */
function bfs_ParseOperators($str, $N=2)
{
    //Prepend a space so that every term, the first one included, is preceded
    //by whitespace; this keeps the checks below simple
    $str = ' ' . $str;

    $has_operator = strstr($str, ' +') || strstr($str, ' -') || strstr($str, ' ~') || strstr($str, '* ');
    if ( !$has_operator ) {
        //No operator anywhere in the criteria: treat everything as an AND search
        return bfs_StringToNgram($str, $N, TRUE, ' +');
    }

    preg_match_all("/\s+[+\-~]?(\"[^\"]+\"|[^\s]+\*?)/", $str, $out, PREG_PATTERN_ORDER);

    $operator = array(
        'plus'     => '',
        'minus'    => '',
        'no'       => '',
        'tilde'    => '',
        'asterisk' => ''
    );

    //Classify the terms by operator
    foreach ( $out[0] as $o ) {
        //Drop all leading whitespace so the operator is found even when a
        //term is preceded by more than one whitespace character
        $term = preg_replace('/^\s+/', '', $o);
        if ( $term === '' ) {
            continue;
        }

        if ( substr($term, -1) == '*' ) {
            //Wildcard terms are passed on as typed (they are not n-grammed),
            //so quotes and backslashes are removed to keep them from
            //terminating the SQL string literal they end up in
            $operator['asterisk'] .= ' ' . str_replace(array("'", '"', '\\'), '', $term);
        } elseif ( $term[0] == '+' ) {
            $operator['plus'] .= substr($term, 1) . ' ';
        } elseif ( $term[0] == '-' ) {
            $operator['minus'] .= substr($term, 1) . ' ';
        } elseif ( $term[0] == '~' ) {
            $operator['tilde'] .= substr($term, 1) . ' ';
        } else {
            $operator['no'] .= ' ' . $term;
        }
    }

    //Turn the keywords of each operator group into n-grams carrying that operator
    $operator['plus']  = !empty($operator['plus'])  ? bfs_StringToNgram($operator['plus'], $N, TRUE, ' +') : '';
    $operator['minus'] = !empty($operator['minus']) ? bfs_StringToNgram($operator['minus'], $N, TRUE, ' -') : '';
    $operator['no']    = !empty($operator['no'])    ? bfs_StringToNgram($operator['no'], $N, TRUE, ' ') : '';
    $operator['tilde'] = !empty($operator['tilde']) ? bfs_StringToNgram($operator['tilde'], $N, TRUE, ' ~') : '';

    //Concatenate all groups into the complete query
    //(separator first: the array-first argument order is gone in PHP 8)
    return implode('', $operator);
}

/**
 * Generate a sub-sequence of n items from a given sequence.
 *
 * Ideographic runs (CJK, kana, hangul) are cut into overlapping N-grams;
 * Western words (Latin, Greek, Cyrillic, digits) are kept whole.
 *
 * With $do_escape the result is a string in which every term - ideographic
 * N-grams hex-encoded, then the Western words - is preceded by $separator,
 * or '' when there is no term. Prefixing every term keeps a query operator
 * such as ' +' or ' -' attached to the first term as well, and lets results
 * for different operators be concatenated safely.
 *
 * @param string  $str       given sequence
 * @param integer $N         n-gram length (values below 1 are treated as 1)
 * @param boolean $do_escape escape the sub-sequences and combine them with $separator
 * @param string  $separator
 * @return mixed sub-sequence array (Western words first) or escaped sub-sequence string
 */
function bfs_StringToNgram($str, $N=2, $do_escape=FALSE, $separator=' ')
{
    $N   = max(1, (int) $N);
    $str = (string) $str;

    if ( get_option('bcse_bfs_strip_html') ) {
        $str = strip_tags($str);
    }

    //Blank out stop words (one per line in the option value)
    $stop_words = get_option('bcse_bfs_stop_words');
    if ( $stop_words && is_string($stop_words) ) {
        $stop_words = preg_split("/(\r|\n|\r\n)+/", $stop_words);
        $str = str_replace($stop_words, ' ', $str);
    }

    //Extract the ideographic keywords from the string
    preg_match_all("/[一-龥ぁ-んァ-ヴー]+|[가-힣]+/u", $str, $out);
    $ideographic_runs = isset($out[0]) ? $out[0] : array();

    //Brute-force split of the ideographic runs into N-grams
    //(runs shorter than N characters are ignored)
    $ngram = array();
    foreach ( $ideographic_runs as $run ) {
        $run_length = mb_strlen($run, 'UTF-8');
        for ( $i = 0; $i <= $run_length - $N; $i++ ) {
            $ngram[] = mb_substr($run, $i, $N, 'UTF-8');
        }
    }

    //Extract the Western keywords from the string
    preg_match_all("/[0-9a-zA-ZÀ-ÖØ-öø-ȳ]+|[Ά-Ͽ]+|[Ѐ-ԓ]+|[0-9a-zA-Z]+/u", $str, $out);
    $western_words = isset($out[0]) ? $out[0] : array();

    if ( !$do_escape ) {
        //Put the Western keywords in front of the N-grams
        return array_merge($western_words, $ngram);
    }

    //Escape: each N-gram is encoded on its own so that the separator is never
    //run through the encoder (which would turn '~' into '7E')
    $terms = array();
    foreach ( $ngram as $gram ) {
        $terms[] = bfs_EncodeMbstringToHex($gram);
    }
    foreach ( $western_words as $word ) {
        $terms[] = $word;
    }

    return count($terms) > 0 ? $separator . implode($separator, $terms) : '';
}

/**
 * A easy way to convert UTF-8 multi-byte charaters to numeric UTF-8 (Hex)
 *
 * The string is URL-encoded and the '%' signs are removed; an encoded space
 * ('+') becomes a space again and an encoded plus ('%2B') becomes '+'.
 *
 * @param string $str raw UTF-8 string
 * @return string encoded string
 */
function bfs_EncodeMbstringToHex($str)
{
    $patterns     = array('/\+/', '/%2B/', '/[%]{1}/');
    $replacements = array(' ', '+', '');

    return preg_replace($patterns, $replacements, urlencode($str));
}

/**
 * Experimental: Bigram Related Entries
 *
 * Prints the posts whose title / content n-grams best match the title
 * n-grams of the current post (global $post), or a "No related posts" entry.
 *
 * NOTE(review): the default wrappers and the (empty) strings around the
 * title look like stripped markup, and $permalink is computed but not
 * printed - presumably list items and a link were lost from this copy of
 * the file. Confirm against the original plugin distribution.
 *
 * @param integer $result_limit maximum number of entries
 * @param mixed   $score_limit  only entries scoring above this value are listed
 * @param string  $before_title text printed before each title
 * @param string  $after_title  text printed after each title
 */
function bfs_GetRelatedEntries($result_limit=10, $score_limit=0, $before_title = '
  • ', $after_title = '
  • ')
{
    global $wpdb, $post;

    //Both limits are interpolated into SQL, so only numbers are let through
    $current_id   = (int) $post->ID;
    $result_limit = (int) $result_limit;
    $score_limit  = is_numeric($score_limit) ? $score_limit : 0;

    $post_title_ngram = $wpdb->get_var("SELECT post_title_ngram FROM {$wpdb->prefix}posts_ngram WHERE ID = $current_id");

    //post_date_gmt is stored in GMT, so it is compared with the current GMT time
    $related = $wpdb->get_results(
        "SELECT {$wpdb->posts}.ID, {$wpdb->posts}.post_title, (MATCH ({$wpdb->prefix}posts_ngram.post_title_ngram) AGAINST ('{$post_title_ngram}' IN BOOLEAN MODE)*10) + MATCH ({$wpdb->prefix}posts_ngram.post_content_ngram) AGAINST ('{$post_title_ngram}') AS score "
        . "FROM {$wpdb->posts} INNER JOIN {$wpdb->prefix}posts_ngram USING (ID) "
        . "WHERE {$wpdb->posts}.ID <> $current_id "
        . " AND post_date_gmt <= '" . gmdate('Y-m-d H:i:s') . "' "
        . " AND post_status IN ('publish', 'static') "
        . " AND post_password = '' "
        . "HAVING score > $score_limit "
        . "ORDER BY score DESC "
        . "LIMIT {$result_limit}"
    );

    if ( !$related ) {
        echo $before_title.'No related posts'.$after_title;
        return;
    }

    $output = '';
    foreach ( $related as $result ) {
        $title     = stripslashes(apply_filters('the_title', $result->post_title));
        $permalink = get_permalink($result->ID);
        $output   .= $before_title .'' . $title . '' . $after_title;
    }
    echo $output;
}

//Install or upgrade on load, then register the hooks
bfs_CheckInstall();

add_action('admin_menu', 'bfs_AdminMenu');
add_action('delete_post', 'bfs_DeleteNgramRow', 1);
add_action('publish_post', 'bfs_UpdatePostNgramRow', 1);

if ( get_option('bcse_bfs_include_comment') ) {
    //When comments are not searched, do not keep the comments n-gram table up to date
    add_action('comment_post', 'bfs_UpdateCommentNgramRow', 1);
    add_action('edit_comment', 'bfs_UpdateCommentNgramRow', 1);
    add_action('delete_comment', 'bfs_UpdateCommentNgramRow', 1);
}

add_filter('posts_request', 'bfs_BuildSearchQuery');
?>