проверка орфографии

pva · Отправлено: **00:11, 12-07-2012** | #5

Tonny_Bennet,
Делаем шаг первый: тупо проверяем правильность слова, но не предлагаем варианты.
словарь взят отсюда: http://files.speakrus.ru/dict/pldf-win.zip
проверяем фрагмент книги братьев Стругацких "Обитаемый остров".

Цитата Tonny_Bennet:

Правда если словарь будет большой то по-моему код будет не очень быстро работать »

На машине i3-2350M 2.30 GHz, 4G RAM, без оптимизации кода, с использованием хеш-таблицы поиск занял 45 мс - это вполне приемлемо.

Код:

#include <hash_map>
#include <string>
#include <cstring>
#include <list>
#include <fstream>
#include <iostream>
#include <ctime>
using namespace std;

// слова храним в обычном списке,
// индексируем список хеш-таблицей указателей

// Hash functor for NUL-terminated C-string keys of the word index.
// Folds the characters with h = 5*h + ch, which is the recurrence used by
// libstdc++'s __gnu_cxx::__stl_hash_string.
struct hash_fun {
	size_t operator()(const char* key) const {
		unsigned long acc = 0;
		for (const char* p = key; *p != '\0'; ++p) {
			acc = 5 * acc + *p;
		}
		return size_t(acc);
	}
};

// Key-equality functor for the word index: two C-string keys are equal
// when strcmp reports identical contents, regardless of pointer value.
struct hash_cmp {
	bool operator()(const char* lhs, const char* rhs) const {
		const int order = strcmp(lhs, rhs);
		return order == 0;
	}
};

// Word storage: each dictionary word is held as a string in this list.
typedef std::list<std::string> wlist_type;
// Index over stored words: keys are C-string pointers hashed and compared
// by content (hash_fun / hash_cmp). The mapped int is only ever
// default-inserted by the code in this file.
typedef __gnu_cxx::hash_map<const char*, int, hash_fun, hash_cmp> windex_type;

// Returns true when c is the code of a Russian letter.
// Assumes single-byte Windows-1251 text (as the 0xC0..0xFF range implies):
// 0xC0..0xFF are the 64 upper/lower-case letters, 0xA8 and 0xB8 are the
// upper/lower-case "yo" that sits outside that range.
// c is expected to be a streambuf int_type value (0..255, or negative EOF).
// The two "yo" codes are therefore compared numerically: a character
// literal would be negative where char is signed (or a multi-character
// constant in a UTF-8 source) and would never equal what sgetc() returns.
inline bool is_ru_alpha(int c) {
	return unsigned(c - 0xc0) < 0x40 || c == 0xb8 || c == 0xa8;
}

// Returns true when c is a real character that is not a Russian letter.
// Negative values (EOF from a streambuf) yield false, so a skip loop built
// on this predicate stops at end of input. A NUL byte (0) counts as an
// ordinary separator; treating it like EOF would make a stray '\0' in the
// input end the scan early.
inline bool is_not_ru_alpha(int c) {
	return 0 <= c && !is_ru_alpha(c);
}

// Reads the next Russian word from stm into str, converted to lower case.
// Leading characters that are not Russian letters are skipped; the word
// ends at the first non-letter or at end of input (that character is left
// in the stream).
// Returns true when a non-empty word was read, false otherwise.
bool get_word(streambuf& stm, string& str) {
	str.clear();

	// Skip separators that precede the word.
	while(is_not_ru_alpha(stm.sgetc())) {
		stm.sbumpc();
	}

	// Collect letters, lower-casing each one.
	while(is_ru_alpha(stm.sgetc())) {
		const int c = stm.sbumpc();
		// OR-ing 32 maps 0xC0..0xDF onto 0xE0..0xFF. Upper-case "yo" (0xA8)
		// already has that bit set, so it is mapped explicitly to 0xB8.
		str.push_back(char(c == 0xa8 ? 0xb8 : (c | 32)));
	}

	return !str.empty();
}

// Loads a dictionary from file into dict and rebuilds index over it.
//   file  - source stream buffer; when null the function does nothing.
//   dict  - receives the words (previous contents are discarded).
//   index - receives one entry per distinct word, keyed by a pointer to the
//           string stored in dict (previous contents are discarded).
// Logs the number of indexed words to clog.
void load_dict(streambuf *file, wlist_type& dict, windex_type& index)
{
	if (!file) {
		return;
	}

	// The index keys point into the strings owned by dict, so drop the
	// index before destroying those strings; otherwise a reload would
	// leave dangling keys behind.
	index.clear();
	dict.clear();

	string word;
	while(get_word(*file, word)) {
		dict.push_back(word);
	}

	// Size the table up front, then register every stored word.
	index.resize(dict.size());
	for(wlist_type::iterator it=dict.begin(), last=dict.end(); it!=last; ++it) {
		index[it->c_str()];
	}

	clog << "indexed " << index.size() << " words\n";
}

// Spell-checks the text in file against index.
//   file  - text stream buffer; when null the function does nothing.
//   index - word index to look each word up in.
// Prints up to max_errors unknown words to cout, followed by a "..." marker
// when more were found, and finally the misspelled/total counters.
void check_file(streambuf* file, windex_type& index)
{
	static int const max_errors = 25;

	if (!file) {
		return;
	}

	string word;
	int cnt_errors = 0,
		cnt_total = 0;

	while(get_word(*file, word)) {
		++cnt_total;
		if (index.find(word.c_str()) != index.end()) {
			continue;
		}

		// Report exactly the first max_errors unknown words
		// (the previous "<" comparison stopped one short).
		++cnt_errors;
		if (cnt_errors <= max_errors) {
			cout << "spell: " << word << "\n";
		}
	}

	// Show the truncation marker only when something was actually omitted.
	if (max_errors < cnt_errors) {
		cout << "   ...\n";
		clog << "showing only first " << max_errors << " errors\n";
	}

	cout << "misspelled " << cnt_errors << " of " << cnt_total << " words\n";
}

int main(int argc, char** argv) {
	clock_t start_time;
	int duration;
	filebuf fbuf;

	#define BENCHMARK(name, foo) \
		start_time = clock();\
		foo; \
		duration = clock() - start_time;\
		clog << name " spent " << double(duration)/CLOCKS_PER_SEC << " secs\n";\

	wlist_type dict;
	windex_type index;

	BENCHMARK("load dictionary", load_dict(fbuf.open("pldf-win.txt", ios_base::in), dict, index));
	fbuf.close();

	BENCHMARK("spell check", check_file(fbuf.open("strugackie_ostrov.txt", ios_base::in), index));
	fbuf.close();

	return 0;
}

Вывод:

Код:

spell: ладошек
spell: ладошки
spell: солнцу
spell: карманные
spell: компьютеры
spell: стругацкие
spell: первая
spell: приоткрыл
spell: высунулся
spell: поглядел
spell: было
spell: низкое
spell: какое
spell: твердое
spell: этой
spell: легкомысленной
spell: прозрачности
spell: намекающей
spell: космоса
spell: обитаемых
spell: миров
spell: настоящая
spell: библейская
spell: гладкая
   ...
misspelled 41216 of 93652 words
indexed 125140 words
load dictionary spent 0.272 secs
showing only first 25 errors
spell check spent 0.045 secs

PhilB, статью на Хабре читал, но ума понять расстояние Левенштейна не хватило (суть понятна, как считать - непонятно). Можешь объяснить его для тупых?

Beyound, судя по "ошибкам", которые выдаёт программа на основании словаря из 120К слов, двигаться нужно в твоём направлении. Типа метод триграмм, только разбивать по частям слова. Но тогда нужно добавлять ограничение на применимость частей слова друг к другу?