Last active
October 5, 2015 21:58
-
-
Save nolanlawson/2883484 to your computer and use it in GitHub Desktop.
Slight tweak of the Solr SpellCheckComponent to make its output quieter when you only care about the collations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of the quiet spellcheck component artifact. -->
  <groupId>org.healthonnet.solr</groupId>
  <artifactId>quiet-spellcheck-component</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- Pin the source encoding so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Solr and Lucene artifacts are all pinned to the same 3.6.2 release. -->
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>3.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>3.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-core</artifactId>
      <version>3.6.2</version>
    </dependency>
    <!-- Test-only dependency. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compile at Java 5 source/target level. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.0</version>
        <configuration>
          <source>1.5</source>
          <target>1.5</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.healthonnet.solr; | |
import java.io.IOException; | |
import java.io.StringReader; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.Token; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; | |
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | |
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.solr.common.SolrException; | |
import org.apache.solr.common.params.CommonParams; | |
import org.apache.solr.common.params.ShardParams; | |
import org.apache.solr.common.params.SolrParams; | |
import org.apache.solr.common.util.NamedList; | |
import org.apache.solr.common.util.SimpleOrderedMap; | |
import org.apache.solr.handler.component.ResponseBuilder; | |
import org.apache.solr.handler.component.SpellCheckComponent; | |
import org.apache.solr.spelling.SolrSpellChecker; | |
import org.apache.solr.spelling.SpellingOptions; | |
import org.apache.solr.spelling.SpellingResult; | |
/** | |
* Wrapper around the SpellCheckComponent that gives a more quiet output. It | |
* only gives collations; does not give any suggestions. This is useful when you | |
* set spellcheck.count to a large number, but you really only want to check the | |
* collations. It avoids writing a ton of output and therefore cuts down on I/O. | |
* | |
* @author nolan | |
* | |
*/ | |
public class QuietSpellCheckComponent extends SpellCheckComponent { | |
@Override | |
@SuppressWarnings("unchecked") | |
public void process(ResponseBuilder rb) throws IOException { | |
SolrParams params = rb.req.getParams(); | |
if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) { | |
return; | |
} | |
boolean shardRequest = "true".equals(params.get(ShardParams.IS_SHARD)); | |
String q = params.get(SPELLCHECK_Q); | |
SolrSpellChecker spellChecker = getSpellChecker(params); | |
Collection<Token> tokens = null; | |
if (q != null) { | |
// we have a spell check param, tokenize it with the query analyzer | |
// applicable for this spellchecker | |
tokens = getTokens(q, spellChecker.getQueryAnalyzer()); | |
} else { | |
q = rb.getQueryString(); | |
if (q == null) { | |
q = params.get(CommonParams.Q); | |
} | |
tokens = queryConverter.convert(q); | |
} | |
if (tokens != null && tokens.isEmpty() == false) { | |
if (spellChecker != null) { | |
int count = params.getInt(SPELLCHECK_COUNT, 1); | |
boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR, DEFAULT_ONLY_MORE_POPULAR); | |
boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false); | |
NamedList response = new SimpleOrderedMap(); | |
IndexReader reader = rb.req.getSearcher().getReader(); | |
boolean collate = params.getBool(SPELLCHECK_COLLATE, false); | |
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE); | |
SolrParams customParams = getCustomParams(getDictionaryName(params), params, shardRequest); | |
SpellingOptions options = new SpellingOptions(tokens, reader, | |
count, onlyMorePopular, extendedResults, accuracy, | |
customParams); | |
SpellingResult spellingResult = spellChecker.getSuggestions(options); | |
if (spellingResult != null) { | |
NamedList suggestions = toNamedList(shardRequest, | |
spellingResult, q, extendedResults, collate); | |
if (collate) { | |
addCollationsToResponse(params, spellingResult, rb, q, suggestions); | |
} | |
// clear the spellingResult after making collations from it. | |
// This is the only change I'm making compared to | |
// SpellCheckComponent! | |
for (int i = suggestions.size() - 1; i >= 0; i--) { | |
if ("collation".equals(suggestions.getName(i))) { | |
Object val = suggestions.getVal(i); | |
if (val instanceof NamedList | |
&& ((NamedList) val).size() > 0 | |
&& "collationQuery".equals(((NamedList) val).getName(0))) { | |
continue; | |
} | |
} | |
// otherwise it's a non-collation; delete it | |
suggestions.remove(i); | |
} | |
response.add("suggestions", suggestions); | |
rb.rsp.add("spellcheck", response); | |
} | |
} else { | |
throw new SolrException(SolrException.ErrorCode.NOT_FOUND, | |
"Specified dictionary does not exist."); | |
} | |
} | |
} | |
private Collection<Token> getTokens(String q, Analyzer analyzer) | |
throws IOException { | |
Collection<Token> result = new ArrayList<Token>(); | |
TokenStream ts = analyzer.reusableTokenStream("", new StringReader(q)); | |
ts.reset(); | |
// TODO: support custom attributes | |
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); | |
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); | |
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); | |
FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class); | |
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); | |
PositionIncrementAttribute posIncAtt = ts | |
.addAttribute(PositionIncrementAttribute.class); | |
while (ts.incrementToken()) { | |
Token token = new Token(); | |
token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); | |
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); | |
token.setType(typeAtt.type()); | |
token.setFlags(flagsAtt.getFlags()); | |
token.setPayload(payloadAtt.getPayload()); | |
token.setPositionIncrement(posIncAtt.getPositionIncrement()); | |
result.add(token); | |
} | |
ts.end(); | |
ts.close(); | |
return result; | |
} | |
private String getDictionaryName(SolrParams params) { | |
String dictName = params.get(SPELLCHECK_DICT); | |
if (dictName == null) { | |
dictName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME; | |
} | |
return dictName; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment