Расчленяем почту

pva · Отправлено: **20:51, 10-11-2013** | #2

Первый эксперимент. В принципе, нормально себя показывает, когда искомая строка гарантированно есть в тексте. Когда её нет или она содержит небольшие ошибки, находит не всегда то, что хотелось бы. Хотя придраться сложно: текст действительно похож. И появилась мысль, как выдирать репосты (по сути, репост — это плагиат).

Код:

#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <functional>
#include <numeric>
#include <list>
#include <valarray>
using namespace std;

/**
 * Sliding inner product ("correlation") of a sample against a data sequence.
 *
 * For every output slot in [result_first, result_last) the generalized inner
 * product of [sample_first, sample_last) with the data window starting at the
 * current data position is stored; the data position then advances by one
 * element together with the output position.
 *
 * The data sequence must provide at least
 * (number of output slots - 1) + (sample length) readable elements.
 * Each window is fully evaluated before its output slot is written, and the
 * window only reads at or ahead of the slot's position, so the output range
 * may start at the same place as the data range.
 *
 * @param sample_first,sample_last  the sample (kernel) range
 * @param data_first                start of the data sequence
 * @param result_first,result_last  output range; its length fixes the number of windows
 * @param init                      initial accumulator value for every window
 * @param plus                      accumulation operation
 * @param times                     element-pair operation
 */
template<typename Iterator2, typename Iterator1, typename OutputIterator,
	typename Value, typename Plus, typename Times>
void corelate(Iterator2 sample_first, Iterator2 sample_last, Iterator1 data_first,
		OutputIterator result_first, OutputIterator result_last,
		Value init, Plus plus, Times times)
{
	Iterator1 window = data_first;
	OutputIterator slot = result_first;

	while (slot != result_last) {
		*slot = inner_product(sample_first, sample_last, window, init, plus, times);
		++window;
		++slot;
	}
}

/**
 * Generic similarity score of two values.
 *
 * @return 100 when a == b, 0 otherwise.
 */
template<typename T>
unsigned fuzzy_match(T const& a, T const& b) {
	return a==b ? 100 : 0;
}

/**
 * Sum of per-character fuzzy_match scores of [first, last) laid over the
 * characters starting at `other` (100 for every position where they agree).
 * The caller guarantees that `other` has at least (last - first) characters.
 */
static unsigned fuzzy_match_aligned(string::const_iterator first, string::const_iterator last,
		string::const_iterator other)
{
	return inner_product(first, last, other,
			unsigned(), plus<unsigned>(), fuzzy_match<string::value_type>);
}

/**
 * Similarity score of two words that tolerates one inserted, deleted or
 * shifted character.
 *
 * Only words whose lengths differ by at most one are compared; the shorter
 * word is laid over the longer one at every offset where it fits (equal
 * lengths: offsets 0, +1 and -1) and the matching positions are summed.
 * The sum is normalized by the average length of the two words.
 *
 * Note: because several alignments are added together, words with repeated
 * characters can score above 100 (e.g. "aa" against "aa" yields 200).
 *
 * @return 0 if either word is empty or the lengths differ by more than one,
 *         otherwise the normalized score described above.
 */
template<>
unsigned fuzzy_match(string const& a, string const& b) {
	string::size_type const a_size = a.size(), b_size = b.size();

	if (!a_size || !b_size)
		return 0;

	// The sizes are compared explicitly instead of switching on the unsigned
	// difference a_size-b_size: that difference is a size_t, so a "minus one"
	// result does not equal the 32-bit constant ~0u on 64-bit targets and the
	// "b is one longer" case would silently score 0.
	unsigned accum;
	if (a_size == b_size) {
		accum = fuzzy_match_aligned(a.begin(), a.end(), b.begin())
			+ fuzzy_match_aligned(b.begin()+1, b.end(), a.begin())
			+ fuzzy_match_aligned(a.begin()+1, a.end(), b.begin());
	} else if (a_size == b_size + 1) {
		// a is one character longer: slide b over a.
		accum = fuzzy_match_aligned(b.begin(), b.end(), a.begin())
			+ fuzzy_match_aligned(b.begin(), b.end(), a.begin()+1);
	} else if (b_size == a_size + 1) {
		// b is one character longer: slide a over b.
		accum = fuzzy_match_aligned(a.begin(), a.end(), b.begin())
			+ fuzzy_match_aligned(a.begin(), a.end(), b.begin()+1);
	} else {
		return 0;
	}

	return static_cast<unsigned>(2*accum / (a_size + b_size));
}

/**
 * Slides `sample` over `source` and scores every alignment.
 *
 * After the call match[i] holds the score of the sample laid over the source
 * elements starting at position i: the sum of fuzzy_match over all element
 * pairs, then smoothed with the spread_sensitivity coefficients, i.e.
 * match[i] = sum over j of spread_sensitivity[j] * raw[i + j].
 * The last (spread_size - 1) entries have no complete smoothing window and
 * keep their raw score.
 *
 * @param sample              sequence to look for
 * @param source              sequence to search in
 * @param match               output; resized to one entry per alignment
 *                            (source.size() - sample.size() + 1), or to 0 when
 *                            no search is possible
 * @param spread_size         number of smoothing coefficients; 0 disables
 *                            smoothing
 * @param spread_sensitivity  array of at least spread_size coefficients
 * @return the largest entry of match, or 0 when the sample is empty, does not
 *         fit into the source, or there are fewer alignments than
 *         spread_size.
 */
template<typename T>
unsigned fuzzy_search(list<T> const& sample, list<T> const& source,
		valarray<unsigned> &match,
		unsigned spread_size, unsigned const *spread_sensitivity)
{
	typename list<T>::size_type const sample_size=sample.size(), source_size=source.size();

	// An empty sample has nothing to compare; a sample longer than the source
	// has no alignment at all. Clear match so callers never see stale scores.
	if (!sample_size || source_size < sample_size) {
		match.resize(0);
		return 0;
	}

	// A sample of length m fits at n - m + 1 positions of a source of length n
	// (the final position, flush with the end of the source, included).
	typename list<T>::size_type const positions = source_size - sample_size + 1;
	if (positions < spread_size) {
		match.resize(0);
		return 0;
	}

	match.resize(positions);
	// r2 is formed by pointer arithmetic: indexing match[match.size()] would be
	// an out-of-range subscript.
	unsigned *r1 = &match[0], *r2 = r1 + positions;

	// Raw score of every alignment.
	corelate(sample.begin(), sample.end(), source.begin(), r1, r2,
			unsigned(), plus<unsigned>(), fuzzy_match<T>);

	// In-place smoothing; each window only reads entries at or after the one it
	// writes. With no coefficients the raw scores are kept as they are.
	if (spread_size) {
		corelate(spread_sensitivity+0, spread_sensitivity+spread_size, r1, r1, r2-(spread_size-1),
				unsigned(), plus<unsigned>(), multiplies<unsigned>());
	}

	return match.max();
}

/**
 * True for bytes that belong to a word: ASCII letters (per isalpha) and, in
 * Windows-1251, the Cyrillic letters 0xC0..0xFF plus the letter "yo", which
 * lives outside that range at 0xA8 (capital) and 0xB8 (small).
 * Expects c in 0..255.
 */
static bool load_words_is_letter(int c) {
	return 192<=c || c==0xA8 || c==0xB8 || isalpha(c);
}

/**
 * Lower-cases a letter byte accepted by load_words_is_letter. Setting bit 5
 * maps A-Z to a-z and 0xC0..0xDF to 0xE0..0xFF; the capital "yo" (0xA8)
 * needs an explicit mapping to 0xB8.
 */
static char load_words_to_lower(int c) {
	return static_cast<char>(c==0xA8 ? 0xB8 : (c|32));
}

/**
 * Reads the file `name` and appends every word found in it to `result`.
 *
 * A word is a maximal run of letter bytes (English letters and Windows-1251
 * Russian letters); every other byte is a separator. Words are stored
 * lower-cased. If the file cannot be opened, `result` is left unchanged.
 *
 * @param name    path of the file to read
 * @param result  list the words are appended to, in file order
 */
void load_words(const char *name, list<string>& result) {
	filebuf::int_type const eof = filebuf::traits_type::eof();

	filebuf in;
	if (!in.open(name, ios_base::in))
		return;

	string word;
	for (filebuf::int_type c = in.sbumpc(); c!=eof; c=in.sbumpc()) {
		if (!load_words_is_letter(c))
			continue;

		word.clear();
		do { word.push_back(load_words_to_lower(c)), c=in.sbumpc(); }
		while (c!=eof && load_words_is_letter(c));
		result.push_back(word);

		// The word ran up to the end of the file: stop instead of reading again.
		if (c==eof)
			break;
	}
}

/**
 * Loads the words of sample.txt and text.txt, scores every alignment of the
 * sample against the text and prints each alignment whose score is above
 * accept_percent of the best score found. Every printed line starts with the
 * score divided by the number of sample words, followed by "%:" and the words
 * of the text at that alignment.
 */
int main() {
	static unsigned const spread_size = 1;
	static unsigned const accept_percent = 90;
	static unsigned const spread[] = {1,1,1,1,1,1,1,1};

	list<string> sample, text;
	load_words("sample.txt", sample);
	load_words("text.txt", text);

	valarray<unsigned> match;
	unsigned const best = fuzzy_search(sample, text, match, spread_size, spread);
	unsigned const threshold = accept_percent*best/100;

	unsigned const window = sample.size();
	list<string>::iterator start = text.begin();
	for (unsigned i = 0; i != match.size(); ++i, ++start) {
		// Only alignments scoring strictly above the threshold are reported.
		if (match[i] <= threshold)
			continue;

		cout << match[i]/window << "%:";
		list<string>::iterator word = start;
		for (unsigned k = 0; k != window; ++k, ++word) {
			cout << " " << *word;
		}
		cout << "\n";
	}

	return 0;
}

можно играться со spread_size и коэффициентами spread (будет сдвигаться ответ)
ответ получается понятней, когда spread_size = 1
исходный файл - text.txt
что искать - sample.txt
понимает английские и русские буквы в кодировке Windows-1251