/* Copyright 2004-2011 by the National and Technical University of Athens This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - This C-version is created by Jan Wielemaker, starting from an almost literal translation from the Java code. From there I did some minor reorganisation, for example to get score_inplace(), which modifies the argument strings, but this doesn't matter for the Prolog interface if as we must ask for BUF_MALLOC anyway. Some timing: AMD Athlon 5400+ Comparing "E56.Language" <-> "languange" (0.711348), 1,000,000 times: Java (OpenJDK 1.6.0_20): 1.89 sec C (Gcc -O2) 0.72 sec Through Prolog 1.05 sec - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ /** * @author Giorgos Stoilos * * This class implements the string matching method proposed in the paper * "A String Metric For Ontology Alignment", published in ISWC 2005 * * Jérōme Euzenat: added normalization */ #ifndef STAND_ALONE #include #endif #define _GNU_SOURCE /* get wcsdup */ #define _CRT_SECURE_NO_WARNINGS 1 #include #include #include #include #include #include "isub.h" #include "wcsdup.ic" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define DEBUG(g) (void)(0) // Default is [-1,1] range, and skip normalization #define ZERO_TO_ONE (0x1) // Result in [0,1] range, [-1,1] if not set #define NORMALIZE (0x2) // Normalize input words static void toLowerCase(wchar_t *s) { wchar_t *q; for(q=s; *q; q++) { wint_t c = *q; if ( iswupper(c) ) { *q = towlower(c); } } } static void normalizeString(wchar_t *s, wint_t remo) { wchar_t *q, *o; for(o=q=s; *q; q++) { if ( *q != remo ) *o++ = *q; } *o = 0; } static size_t common_prefix_length(const wchar_t *s1, const wchar_t *s2) { size_t i; size_t n = MIN(wcslen(s1), wcslen(s2)); for (i = 0; i < n; i++) { if (s1[i] != s2[i]) break; } return i; } double isub_score_inplace(wchar_t *s1, wchar_t *s2, int options, int substring_threshold) { int l1, l2, L1, L2; double common = 0.0; size_t common_prefix_len; int best = 2; if ( options & NORMALIZE ) { toLowerCase(s1); toLowerCase(s2); normalizeString(s1, '.'); normalizeString(s2, '.'); normalizeString(s1, '_'); normalizeString(s2, '_'); normalizeString(s1, ' '); normalizeString(s2, ' '); } common_prefix_len = common_prefix_length(s1, s2); l1 = (int)wcslen(s1); // length of s l2 = (int)wcslen(s2); // length of t L1 = l1; L2 = l2; if ((L1 == 0) && (L2 == 0)) return 1.0; // Modification JE: giorgos put -1 instead of 0 if ((L1 == 0) || (L2 == 0)) return 0; while ( l1 > 0 && l2 > 0 && best != 0) { int i = 0; // iterates through s1 int j = 0; // iterates through s2 int startS2 = 0; int endS2 = 0; int startS1 = 0; int endS1 = 0; int p = 0; best = 0; // the best subs length so far for (i = 0; (i < l1) && (l1 - i > best); i++) { j = 0; while (l2 - j > best) { int k = i; for (; (j < l2) && (s1[k] != s2[j]); j++) ; if (j != l2) { // we have found a starting point p = j; for ( j++, k++; (j < l2) && (k < l1) && (s1[k] == s2[j]); j++, k++ ); if (k - i > best) { best = k - i; startS1 = i; endS1 = k; startS2 = p; endS2 = j; } } } } DEBUG(wprintf(L"%ls(%d) %d..%d; %ls(%d) %d..%d -->", s1, l1, startS1, endS1, s2, l2, startS2, endS2)); memmove(&s1[startS1], &s1[endS1], (l1+1-endS1)*sizeof(wchar_t)); memmove(&s2[startS2], &s2[endS2], (l2+1-endS2)*sizeof(wchar_t)); l1 -= endS1-startS1; l2 -= endS2-startS2; if (best > substring_threshold) // Original algorighm used best > 2 common += best; else best = 0; } { double scaledCommon = (double) (2.0 * common) / (double)(L1 + L2); double commonality = scaledCommon; double dissimilarity = 0.0; double rest1 = (double)L1 - common; double rest2 = (double)L2 - common; double unmatchedS1 = rest1 / (double)L1; double unmatchedS2 = rest2 / (double)L2; double result; /** * Hamacher Product */ double suma = unmatchedS1 + unmatchedS2; double product = unmatchedS1 * unmatchedS2; double p = 0.6; //For 1 it coincides with the algebraic product double winklerImprovementVal = (double)MIN(4,common_prefix_len)*0.1*(1.0-commonality); if ((suma - product) == 0) dissimilarity = 0; else dissimilarity = (product) / (p + (1 - p) * (suma - product)); // Modification JE: returned normalization (instead of [-1 1]) result = commonality - dissimilarity + winklerImprovementVal; if ( options & ZERO_TO_ONE ) return (result + 1) / 2; else return result; // Original algorithm returns result in [-1,1] } } double isub_score(const wchar_t *st1, const wchar_t *st2, int options, int substring_threshold) { wchar_t *s1 = wcsdup(st1); wchar_t *s2 = wcsdup(st2); double rc; rc = isub_score_inplace(s1, s2, options, substring_threshold); free(s1); free(s2); return rc; } #ifdef STAND_ALONE int main(int argc, char **argv) { mbstate_t state; wchar_t ws1[1024]; wchar_t ws2[1024]; const char *s1 = argv[1]; const char *s2 = argv[2]; size_t l1, l2; memset(&state, 0, sizeof(state)); l1 = mbsrtowcs(ws1, &s1, 1024, &state); l2 = mbsrtowcs(ws2, &s2, 1024, &state); double total=0.0; int i; for(i=0; i<1000000; i++) total += isub_score(ws1, ws2, NORMALIZE, 2); total = isub_score(ws1, ws2, ZERO_TO_ONE|NORMALIZE, 0); wprintf(L"zero_to_one, normalize, threshold=0: %ls %ls: %f\n", ws1, ws2, total); total = isub_score(ws1, ws2, NORMALIZE, 0); wprintf(L"minus_one_to_one, normalize, threshold=0: %ls %ls: %f\n", ws1, ws2, total); total = isub_score(ws1, ws2, ZERO_TO_ONE|NORMALIZE, 2); wprintf(L"zero_to_one, normalize, threshold=2: %ls %ls: %f\n", ws1, ws2, total); total = isub_score(ws1, ws2, NORMALIZE, 2); wprintf(L"minus_one_to_one, normalize, threshold=2: %ls %ls: %f\n", ws1, ws2, total); return 0; } #endif