  /*
   * language_data.js
   *
   * This script contains the language-specific data used by searchtools.js,
   * namely the list of stopwords, stemmer, scorer and splitter.
   */
  // Stopword list exposed as a global for searchtools.js.
  // NOTE(review): presumably these words are dropped from search queries by
  // searchtools.js -- confirm against that file; only the list is defined here.
  var stopwords = [
    "a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
    "into", "is", "it", "near", "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these", "they", "this", "to",
    "was", "will", "with"
  ];
  /* Non-minified version is copied as a separate JS file, if available */
  /**
   * Porter Stemmer
   *
   * Constructor: `new Stemmer()` yields an object with a single method,
   * `stemWord(w)`, which reduces a word to its stem by applying the
   * suffix-stripping steps (1a, 1b, 1c, 2, 3, 4, 5) in order.
   *
   * All regular expressions are compiled once per Stemmer instance rather
   * than on every `stemWord` call; none of them use the `g`/`y` flags, so
   * sharing them between calls carries no `lastIndex` state.
   */
  var Stemmer = function() {
    // Step 2: suffix -> replacement, applied when the remaining stem has m > 0.
    var step2list = {
      ational: 'ate',
      tional: 'tion',
      enci: 'ence',
      anci: 'ance',
      izer: 'ize',
      bli: 'ble',
      alli: 'al',
      entli: 'ent',
      eli: 'e',
      ousli: 'ous',
      ization: 'ize',
      ation: 'ate',
      ator: 'ate',
      alism: 'al',
      iveness: 'ive',
      fulness: 'ful',
      ousness: 'ous',
      aliti: 'al',
      iviti: 'ive',
      biliti: 'ble',
      logi: 'log'
    };

    // Step 3: suffix -> replacement, applied when the remaining stem has m > 0.
    var step3list = {
      icate: 'ic',
      ative: '',
      alize: 'al',
      iciti: 'ic',
      ical: 'ic',
      ful: '',
      ness: ''
    };

    var c = "[^aeiou]";          // consonant
    var v = "[aeiouy]";          // vowel
    var C = c + "[^aeiouy]*";    // consonant sequence
    var V = v + "[aeiou]*";      // vowel sequence

    // Measure / shape predicates on a stem.
    var reMgr0 = new RegExp("^(" + C + ")?" + V + C);                    // [C]VC... is m>0
    var reMeq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");  // [C]VC[V] is m=1
    var reMgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);            // [C]VCVC... is m>1
    var reVowelInStem = new RegExp("^(" + C + ")?" + v);                 // vowel in stem
    // Whole stem is consonant-sequence + vowel + one consonant other than w, x, y.
    var reCvc = new RegExp("^" + C + v + "[^aeiouwxy]$");
    // Stem ends in a doubled consonant other than l, s, z.
    var reDoubled = new RegExp("([^aeiouylsz])\\1$");
    // Used to drop the final character ("." deliberately does not match a newline).
    var reLastChar = /.$/;

    // Per-step suffix patterns.
    var reStep1aEs = /^(.+?)(ss|i)es$/;
    var reStep1aS = /^(.+?)([^s])s$/;
    var reStep1bEed = /^(.+?)eed$/;
    var reStep1bEdIng = /^(.+?)(ed|ing)$/;
    var reStep1bAddE = /(at|bl|iz)$/;
    var reStep1c = /^(.+?)y$/;
    var reStep2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    var reStep3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    var reStep4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    var reStep4Ion = /^(.+?)(s|t)(ion)$/;
    var reStep5E = /^(.+?)e$/;
    var reStep5Ll = /ll$/;

    /**
     * Stem a single word.
     *
     * @param {string} w - The word to stem.
     * @returns {string} The stemmed word; `w` itself when it is shorter than
     *   three characters.
     */
    this.stemWord = function (w) {
      var match;
      var stem;
      var suffix;

      if (w.length < 3) {
        return w;
      }

      // An initial "y" is upper-cased for the duration of the algorithm so the
      // vowel class (which contains lowercase "y") does not match it; it is
      // lower-cased again before returning.
      var firstch = w.charAt(0);
      if (firstch === "y") {
        w = firstch.toUpperCase() + w.slice(1);
      }

      // Step 1a
      if (reStep1aEs.test(w)) {
        w = w.replace(reStep1aEs, "$1$2");
      } else if (reStep1aS.test(w)) {
        w = w.replace(reStep1aS, "$1$2");
      }

      // Step 1b
      match = reStep1bEed.exec(w);
      if (match) {
        // "...eed" -> "...ee" when the part before "eed" has m > 0.
        if (reMgr0.test(match[1])) {
          w = w.replace(reLastChar, "");
        }
      } else {
        match = reStep1bEdIng.exec(w);
        if (match) {
          stem = match[1];
          // Only strip "ed"/"ing" when the remaining stem contains a vowel.
          if (reVowelInStem.test(stem)) {
            w = stem;
            if (reStep1bAddE.test(w)) {
              w = w + "e";
            } else if (reDoubled.test(w)) {
              w = w.replace(reLastChar, "");
            } else if (reCvc.test(w)) {
              w = w + "e";
            }
          }
        }
      }

      // Step 1c
      match = reStep1c.exec(w);
      if (match) {
        stem = match[1];
        if (reVowelInStem.test(stem)) {
          w = stem + "i";
        }
      }

      // Step 2
      match = reStep2.exec(w);
      if (match) {
        stem = match[1];
        suffix = match[2];
        if (reMgr0.test(stem)) {
          w = stem + step2list[suffix];
        }
      }

      // Step 3
      match = reStep3.exec(w);
      if (match) {
        stem = match[1];
        suffix = match[2];
        if (reMgr0.test(stem)) {
          w = stem + step3list[suffix];
        }
      }

      // Step 4
      match = reStep4.exec(w);
      if (match) {
        stem = match[1];
        if (reMgr1.test(stem)) {
          w = stem;
        }
      } else {
        // "(s|t)ion" is only considered when no other step-4 suffix matched.
        match = reStep4Ion.exec(w);
        if (match) {
          stem = match[1] + match[2];
          if (reMgr1.test(stem)) {
            w = stem;
          }
        }
      }

      // Step 5
      match = reStep5E.exec(w);
      if (match) {
        stem = match[1];
        if (reMgr1.test(stem) || (reMeq1.test(stem) && !reCvc.test(stem))) {
          w = stem;
        }
      }
      if (reStep5Ll.test(w) && reMgr1.test(w)) {
        w = w.replace(reLastChar, "");
      }

      // and turn initial Y back to y
      if (firstch === "y") {
        w = firstch.toLowerCase() + w.slice(1);
      }
      return w;
    };
  };