5.5. Analyses Attributes
You can use the com.basistech.rosette.lucene.AnalysesAttribute object to gather linguistic data about the text in a document. Depending on the language, the data may include tokens, normalized tokens, lemmas, part-of-speech tags, readings, compound components, and Semitic roots.
The Lucene sample, AnalysesAttributeSample.java, illustrates this.
package com.basistech.rosette.samples;
import com.basistech.rosette.bl.Analysis;
import com.basistech.rosette.bl.KoreanAnalysis;
import com.basistech.rosette.lucene.AnalysesAttribute;
import com.basistech.rosette.lucene.BaseLinguisticsAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Example program that does analysis with a base linguistics analyzer and demonstrates usage
 * of the AnalysesAttribute object. This does not set up and run a Lucene index; it just shows
 * the construction of the analysis chain.
 *
 * <p>Usage: {@code AnalysesAttributeSample rootDirectory language input output}. The input file
 * is read as UTF-8 (a leading byte order mark is skipped) and the report is written to the
 * output file as UTF-8.
 */
public final class AnalysesAttributeSample {
    private String rootDirectory;
    private String language;
    private String inputPathname;
    private String outputPathname;
    private Analyzer rblAnalyzer;

    private AnalysesAttributeSample() {
        // Instances are only created by main(), which fills in the fields directly.
    }

    /**
     * Creates the analyzer from the command-line settings. The license file is looked up at
     * {@code licenses/rlp-license.xml} under the root directory.
     */
    private void initialize() {
        File rootPath = new File(rootDirectory);
        String licensePath = new File(
                rootPath, "licenses/rlp-license.xml").getAbsolutePath();
        Map<String, String> options = new HashMap<>();
        options.put("language", language);
        options.put("rootDirectory", rootDirectory);
        options.put("licensePath", licensePath);
        options.put("caseSensitive", "true");
        options.put("nfkcNormalize", "true");
        rblAnalyzer = new BaseLinguisticsAnalyzer(options);
    }

    /**
     * Opens the input file as UTF-8 text and skips a leading byte order mark if one is present.
     *
     * @return a reader positioned at the first character of real content
     * @throws IOException if the file cannot be opened or read
     */
    private BufferedReader openInput() throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(inputPathname), StandardCharsets.UTF_8));
        // Peek at the first character; put it back unless it is a BOM (U+FEFF).
        reader.mark(1);
        int bomPerhaps = reader.read();
        if (bomPerhaps != 0xfeff) {
            reader.reset();
        }
        return reader;
    }

    /**
     * Tokenizes the input file and writes one report entry per analyzed token to the output file.
     * If the input file cannot be opened, a message is printed and the JVM exits with status 1.
     *
     * @throws IOException if tokenizing fails or the output file cannot be written
     */
    private void run() throws IOException {
        BufferedReader opened;
        try {
            opened = openInput();
        } catch (IOException ie) {
            System.err.printf("Failed to open input file %s%n", inputPathname);
            System.exit(1);
            return;
        }
        // try-with-resources guarantees the token stream and the reader are closed even when
        // analysis or output fails part-way through.
        try (BufferedReader input = opened;
             TokenStream tokens = rblAnalyzer.tokenStream("", input)) {
            tokens.reset();
            CharTermAttribute charTerm = tokens.getAttribute(CharTermAttribute.class);
            TypeAttribute type = tokens.getAttribute(TypeAttribute.class);
            AnalysesAttribute analysesAttribute = tokens.getAttribute(AnalysesAttribute.class);
            try (Writer output = new OutputStreamWriter(
                         new FileOutputStream(outputPathname), StandardCharsets.UTF_8);
                 PrintWriter pr = new PrintWriter(output)) {
                while (tokens.incrementToken()) {
                    Analysis selectedAnalysis = analysesAttribute.selectedAnalysis();
                    Analysis[] analyses = analysesAttribute.analyses();
                    // Skip tokens that carry no analyses at all: neither a selected
                    // analysis nor an array of candidate analyses.
                    if (null == selectedAnalysis && null == analyses) {
                        continue;
                    }
                    // Print the surface form.
                    // (May have been lowercased by the LowerCaseFilter)
                    pr.format("%s\t%s%n", charTerm.toString(), type.type());
                    if (null == selectedAnalysis) {
                        // For languages where disambiguation is not supported:
                        // report every candidate analysis.
                        for (Analysis analysis : analyses) {
                            if (null != analysis) {
                                printAnalysisComponents(analysis, pr);
                            }
                        }
                    } else {
                        printAnalysisComponents(selectedAnalysis, pr);
                    }
                }
                // Complete the TokenStream consumer workflow before the stream is closed.
                tokens.end();
                // PrintWriter never throws on write failure; ask it explicitly so that a
                // truncated report is not announced as a success.
                if (pr.checkError()) {
                    throw new IOException("Failed to write output file " + outputPathname);
                }
            }
        }
        System.out.println("See " + outputPathname);
    }

    /**
     * Writes every component that is present on the given analysis, one labelled line each.
     *
     * @param analysis the analysis to report; must not be null
     * @param pr the destination for the report lines
     * @throws IOException declared for callers; PrintWriter itself does not throw on write
     */
    private void printAnalysisComponents(Analysis analysis, PrintWriter pr) throws IOException {
        if (analysis.getPartOfSpeech() != null) {
            pr.format("\t\tpart of speech:\t\t%s%n", analysis.getPartOfSpeech());
        }
        if (analysis.getCompoundComponents() != null
                && analysis.getCompoundComponents().length > 0) {
            pr.format("\t\tcompound components:\t%s%n",
                    Arrays.toString(analysis.getCompoundComponents()));
        }
        // This is one way to return lemmas for every token, even if they match,
        // rather than the default analysis chain behavior of omitting
        // matching lemmas.
        if (analysis.getLemma() != null) {
            pr.format("\t\tlemma:\t\t\t%s%n", analysis.getLemma());
        }
        if (analysis.getNormalizedToken() != null) {
            pr.format("\t\tnormalized token:\t%s%n", analysis.getNormalizedToken());
        }
        if (analysis.getReadings() != null
                && analysis.getReadings().length > 0) {
            pr.format("\t\treadings:\t\t%s%n", Arrays.toString(analysis.getReadings()));
        }
        if (analysis.getSemiticRoot() != null) {
            pr.format("\t\tSemitic root:\t\t%s%n", analysis.getSemiticRoot());
        }
        if (analysis.getStem() != null) {
            pr.format("\t\tstem:\t\t\t%s%n", analysis.getStem());
        }
        // Check the runtime type instead of asserting it: assertions are normally disabled,
        // in which case an unexpected analysis type would fail with a ClassCastException.
        if ("kor".equals(language) && analysis instanceof KoreanAnalysis) {
            KoreanAnalysis koreanAnalysis = (KoreanAnalysis) analysis;
            List<?> morphemes = koreanAnalysis.getMorphemes();
            List<?> tags = koreanAnalysis.getTags();
            if (morphemes != null) {
                pr.format("\t\tmorphemes:\t\t%s%n", Arrays.toString(morphemes.toArray()));
            }
            if (tags != null) {
                pr.format("\t\ttags:\t\t\t%s%n", Arrays.toString(tags.toArray()));
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args exactly four values: rootDirectory, language, input pathname, output pathname
     */
    public static void main(String[] args) {
        if (args.length != 4) {
            System.err.println("Usage:"
                    + " com.basistech.rosette.samples.AnalysesAttributeSample "
                    + "rootDirectory language input output");
            return;
        }
        AnalysesAttributeSample that = new AnalysesAttributeSample();
        that.rootDirectory = args[0];
        that.language = args[1];
        that.inputPathname = args[2];
        that.outputPathname = args[3];
        that.initialize();
        // The analyzer is Closeable; release it once the run has finished, even on failure.
        try (Analyzer analyzer = that.rblAnalyzer) {
            that.run();
        } catch (IOException e) {
            System.err.println("Exception processing the data.");
            e.printStackTrace();
        }
    }
}