segmentation
Class ActiveSegmentation

java.lang.Object
  extended by gate.util.AbstractFeatureBearer
      extended by gate.creole.AbstractResource
          extended by gate.creole.AbstractProcessingResource
              extended by gate.creole.AbstractLanguageAnalyser
                  extended by segmentation.ActiveSegmentation
All Implemented Interfaces:
gate.creole.ANNIEConstants, gate.Executable, gate.util.FeatureBearer, gate.LanguageAnalyser, gate.util.NameBearer, gate.ProcessingResource, gate.Resource, Serializable

public class ActiveSegmentation
extends gate.creole.AbstractLanguageAnalyser

A sentence splitter that uses an active learning procedure. Its purpose is to segment a corpus of texts into sentences without splitting on abbreviations. You will have to tag a sample (Parameter: corpusSampleDocument) of the full corpus (Parameter: corpus) during steps 2, 3, 4, 5 and 6. At the end of the process, in step 7, you will have all files in the corpus (Parameter: corpus) rewritten in new files with one sentence per line, plus two lists: one of abbreviations and one of capitalized words. All files will be put in the results directory (Parameter: resultsDirectoryURL). Steps: 1. Compute statistics on the full corpus. 2. Use lists and statistics to do a first tagging of abbreviations. 3. Learn and show the learning error on abbreviations. 4. Do a first tagging of capitalized words. 5. Learn and show the learning error on capitalized words. 6. Determine capitalized words according to abbreviations and conversely. 7. Apply the learned model on the full corpus to segment sentences.

See Also:
Serialized Form

Nested Class Summary
 class ActiveSegmentation.LearnAbbreviationsDialog
          Show the user the unpredicted instances with several learning algorithms.
 class ActiveSegmentation.LearnCapitalizedWordsDialog
          Show the user the unpredicted instances with several learning algorithms.
 class ActiveSegmentation.Statistics2Dialog
          Let the user choose parameters to classify words into 3 classes: certain, possible and impossible capitalized words.
 class ActiveSegmentation.StatisticsDialog
          Let the user choose parameters to classify words into 3 classes: certain, possible and impossible abbreviations.
 class ActiveSegmentation.Step06Dialog
          Determine CapitalizedWord words according to Abbreviations and conversely.
 
Field Summary
static String SPLIT_CORPUS_PARAMETER_NAME
           
static String SPLIT_ENCODING_PARAMETER_NAME
           
static String SPLIT_GAZ_URL_PARAMETER_NAME
           
static String SPLIT_INPUT_AS_PARAMETER_NAME
           
static String SPLIT_LEXICON_URL_PARAMETER_NAME
           
static String SPLIT_LINE_ANNOTATIONS_TO_SKIP
           
static String SPLIT_OUTPUT_AS_PARAMETER_NAME
           
static String SPLIT_PREFIXES_URL_PARAMETER_NAME
           
static String SPLIT_RESOURCES_URL_PARAMETER_NAME
           
static String SPLIT_RESULTS_URL_PARAMETER_NAME
           
static String SPLIT_SAMPLE_DOCUMENT_PARAMETER_NAME
           
static String SPLIT_SUFFIXES_URL_PARAMETER_NAME
           
static String SPLIT_TAGS_URL_PARAMETER_NAME
           
static String SPLIT_TRANSD_URL_PARAMETER_NAME
           
 
Fields inherited from interface gate.creole.ANNIEConstants
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PR_NAMES, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
 
Constructor Summary
ActiveSegmentation()
           
 
Method Summary
 void execute()
          Execute the segmentation step by step.
 gate.Corpus getCorpus()
           
 gate.Document getCorpusSampleDocument()
           
 String getEncoding()
           
 URL getGazetteerFileURL()
           
 String getInputASName()
           
 URL getLexiconFileURL()
           
 List getlineAnnotationsToSkipList()
           
 String getOutputASName()
           
 Boolean getPOSTagging()
           
 URL getPrefixesFileURL()
           
 URL getResourcesDirectoryURL()
           
 URL getResultsDirectoryURL()
           
 URL getSuffixesFileURL()
           
 URL getTagsFileURL()
           
 URL getTransducerDirectoryURL()
           
 gate.Resource init()
          Initialize the resources.
 void interrupt()
          Notifies all the processing resources (PRs) in this controller that they should stop their execution as soon as possible.
 void reInit()
          Reinitialize the resources.
 void setCorpus(gate.Corpus _corpus)
           
 void setCorpusSampleDocument(gate.Document _corpusSampleDocument)
           
 void setEncoding(String newEncoding)
           
 void setGazetteerFileURL(URL newGazetteerFileURL)
           
 void setInputASName(String newInputASName)
           
 void setLexiconFileURL(URL newLexiconFileURL)
           
 void setlineAnnotationsToSkipList(List newlineAnnotationsToSkipList)
           
 void setOutputASName(String newOutputASName)
           
 void setPOSTagging(Boolean newPOSTagging)
           
 void setPrefixesFileURL(URL newPrefixesFileURL)
           
 void setResourcesDirectoryURL(URL newResourcesDirectoryURL)
           
 void setResultsDirectoryURL(URL newResultsDirectoryURL)
           
 void setSuffixesFileURL(URL newSuffixesFileURL)
           
 void setTagsFileURL(URL newTagsFileURL)
           
 void setTransducerDirectoryURL(URL newTransducerDirectoryURL)
           
 
Methods inherited from class gate.creole.AbstractLanguageAnalyser
getDocument, setDocument
 
Methods inherited from class gate.creole.AbstractProcessingResource
addProgressListener, addStatusListener, cleanup, isInterrupted, removeProgressListener, removeStatusListener
 
Methods inherited from class gate.creole.AbstractResource
checkParameterValues, getBeanInfo, getName, getParameterValue, getParameterValue, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface gate.Resource
cleanup, getParameterValue, setParameterValue, setParameterValues
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, setFeatures
 
Methods inherited from interface gate.util.NameBearer
getName, setName
 
Methods inherited from interface gate.Executable
isInterrupted
 

Field Detail

SPLIT_SAMPLE_DOCUMENT_PARAMETER_NAME

public static final String SPLIT_SAMPLE_DOCUMENT_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_CORPUS_PARAMETER_NAME

public static final String SPLIT_CORPUS_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_LINE_ANNOTATIONS_TO_SKIP

public static final String SPLIT_LINE_ANNOTATIONS_TO_SKIP
See Also:
Constant Field Values

SPLIT_INPUT_AS_PARAMETER_NAME

public static final String SPLIT_INPUT_AS_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_OUTPUT_AS_PARAMETER_NAME

public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_ENCODING_PARAMETER_NAME

public static final String SPLIT_ENCODING_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_RESOURCES_URL_PARAMETER_NAME

public static final String SPLIT_RESOURCES_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_GAZ_URL_PARAMETER_NAME

public static final String SPLIT_GAZ_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_TRANSD_URL_PARAMETER_NAME

public static final String SPLIT_TRANSD_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_PREFIXES_URL_PARAMETER_NAME

public static final String SPLIT_PREFIXES_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_SUFFIXES_URL_PARAMETER_NAME

public static final String SPLIT_SUFFIXES_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_TAGS_URL_PARAMETER_NAME

public static final String SPLIT_TAGS_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_LEXICON_URL_PARAMETER_NAME

public static final String SPLIT_LEXICON_URL_PARAMETER_NAME
See Also:
Constant Field Values

SPLIT_RESULTS_URL_PARAMETER_NAME

public static final String SPLIT_RESULTS_URL_PARAMETER_NAME
See Also:
Constant Field Values
Constructor Detail

ActiveSegmentation

public ActiveSegmentation()
Method Detail

init

public gate.Resource init()
                   throws gate.creole.ResourceInstantiationException
Initialize the resources.

Throws:
gate.creole.ResourceInstantiationException

reInit

public void reInit()
            throws gate.creole.ResourceInstantiationException
Reinitialize the resources.

Throws:
gate.creole.ResourceInstantiationException

execute

public void execute()
             throws gate.creole.ExecutionException
Execute the segmentation step by step.

Throws:
gate.creole.ExecutionException

interrupt

public void interrupt()
Notifies all the processing resources (PRs) in this controller that they should stop their execution as soon as possible.


setCorpusSampleDocument

public void setCorpusSampleDocument(gate.Document _corpusSampleDocument)

getCorpusSampleDocument

public gate.Document getCorpusSampleDocument()

setCorpus

public void setCorpus(gate.Corpus _corpus)

getCorpus

public gate.Corpus getCorpus()

setlineAnnotationsToSkipList

public void setlineAnnotationsToSkipList(List newlineAnnotationsToSkipList)

getlineAnnotationsToSkipList

public List getlineAnnotationsToSkipList()

setInputASName

public void setInputASName(String newInputASName)

getInputASName

public String getInputASName()

setOutputASName

public void setOutputASName(String newOutputASName)

getOutputASName

public String getOutputASName()

setEncoding

public void setEncoding(String newEncoding)

getEncoding

public String getEncoding()

setResourcesDirectoryURL

public void setResourcesDirectoryURL(URL newResourcesDirectoryURL)

getResourcesDirectoryURL

public URL getResourcesDirectoryURL()

setGazetteerFileURL

public void setGazetteerFileURL(URL newGazetteerFileURL)

getGazetteerFileURL

public URL getGazetteerFileURL()

setTransducerDirectoryURL

public void setTransducerDirectoryURL(URL newTransducerDirectoryURL)

getTransducerDirectoryURL

public URL getTransducerDirectoryURL()

setPrefixesFileURL

public void setPrefixesFileURL(URL newPrefixesFileURL)

getPrefixesFileURL

public URL getPrefixesFileURL()

setSuffixesFileURL

public void setSuffixesFileURL(URL newSuffixesFileURL)

getSuffixesFileURL

public URL getSuffixesFileURL()

setTagsFileURL

public void setTagsFileURL(URL newTagsFileURL)

getTagsFileURL

public URL getTagsFileURL()

setLexiconFileURL

public void setLexiconFileURL(URL newLexiconFileURL)

getLexiconFileURL

public URL getLexiconFileURL()

setResultsDirectoryURL

public void setResultsDirectoryURL(URL newResultsDirectoryURL)

getResultsDirectoryURL

public URL getResultsDirectoryURL()

setPOSTagging

public void setPOSTagging(Boolean newPOSTagging)

getPOSTagging

public Boolean getPOSTagging()