tbrianjones/whitespacePhraseTokenizer.php

## whitespacePhraseTokenizer.php
<?php

namespace NlpTools\Tokenizers;

/**
 * Whitespace phrase tokenizer.
 *
 * Splits text on runs of Unicode separator (\pZ) and "other" (\pC, e.g.
 * control) characters, then returns every single word together with every
 * phrase (n-gram) of 2 up to $n consecutive words.
 */
class WhitespacePhraseTokenizer implements TokenizerInterface
{

    /**
     * Maximum number of words per phrase. Defaults to 1 (unigrams only),
     * which matches the result produced when set_n() is never called.
     */
    private $n = 1;

    /**
     * Delimiter pattern: one or more Unicode separator (\pZ) or
     * "other" (\pC) characters, matched in UTF-8 mode.
     */
    const PATTERN = '/[\pZ\pC]+/u';

    /**
     * Set the maximum phrase length.
     *
     * @param int $n Largest number of consecutive words joined into one
     *               phrase; values below 2 yield unigrams only.
     */
    public function set_n( $n ) {
      $this->n = $n;
    }

    /**
     * Tokenize a string into words and multi-word phrases.
     *
     * The result lists all unigrams in text order, followed by all 2-word
     * phrases, then all 3-word phrases, and so on up to the configured
     * length. Words inside a phrase are joined with a single space.
     *
     * @param  string $str UTF-8 text to tokenize.
     * @return array  List of tokens (unigrams first, then longer phrases).
     * @throws \InvalidArgumentException If the text cannot be split, e.g.
     *                                   because it is not valid UTF-8.
     */
    public function tokenize( $str )
    {

        // -1 means "no limit"; passing null here is deprecated since PHP 8.1.
        $unigrams = preg_split( self::PATTERN, $str, -1, PREG_SPLIT_NO_EMPTY );

        // With the /u modifier preg_split() returns false on malformed UTF-8.
        // Fail explicitly instead of feeding false to count()/array_merge().
        if ( $unigrams === false ) {
            throw new \InvalidArgumentException(
                'Unable to tokenize input (PCRE error code ' . preg_last_error() . ')'
            );
        }

        $num_unigrams = count( $unigrams );

        // A phrase can never contain more words than the text itself, so
        // longer lengths are skipped up front.
        $max_n = min( $this->n, $num_unigrams );

        // generate the longer phrases, shortest length first
        $ngrams = array();
        for ( $n = 2; $n <= $max_n; $n++ ) {
            // slide a window of $n words across the text
            for ( $i = 0; $i <= $num_unigrams - $n; $i++ ) {
                $ngrams[] = implode( ' ', array_slice( $unigrams, $i, $n ) );
            }
        }

        // unigrams first, then the longer phrases
        return array_merge( $unigrams, $ngrams );

    }
}
	<?php

	namespace NlpTools\Tokenizers;

	/**
	 * Whitespace phrase tokenizer.
	 *
	 * Splits text on runs of Unicode separator (\pZ) and "other" (\pC, e.g.
	 * control) characters, then returns every single word together with every
	 * phrase (n-gram) of 2 up to $n consecutive words.
	 */
	class WhitespacePhraseTokenizer implements TokenizerInterface
	{

	    /**
	     * Maximum number of words per phrase. Defaults to 1 (unigrams only),
	     * which matches the result produced when set_n() is never called.
	     */
	    private $n = 1;

	    /**
	     * Delimiter pattern: one or more Unicode separator (\pZ) or
	     * "other" (\pC) characters, matched in UTF-8 mode.
	     */
	    const PATTERN = '/[\pZ\pC]+/u';

	    /**
	     * Set the maximum phrase length.
	     *
	     * @param int $n Largest number of consecutive words joined into one
	     *               phrase; values below 2 yield unigrams only.
	     */
	    public function set_n( $n ) {
	        $this->n = $n;
	    }

	    /**
	     * Tokenize a string into words and multi-word phrases.
	     *
	     * The result lists all unigrams in text order, followed by all 2-word
	     * phrases, then all 3-word phrases, and so on up to the configured
	     * length. Words inside a phrase are joined with a single space.
	     *
	     * @param  string $str UTF-8 text to tokenize.
	     * @return array  List of tokens (unigrams first, then longer phrases).
	     * @throws \InvalidArgumentException If the text cannot be split, e.g.
	     *                                   because it is not valid UTF-8.
	     */
	    public function tokenize( $str )
	    {

	        // -1 means "no limit"; passing null here is deprecated since PHP 8.1.
	        $unigrams = preg_split( self::PATTERN, $str, -1, PREG_SPLIT_NO_EMPTY );

	        // With the /u modifier preg_split() returns false on malformed UTF-8.
	        // Fail explicitly instead of feeding false to count()/array_merge().
	        if ( $unigrams === false ) {
	            throw new \InvalidArgumentException(
	                'Unable to tokenize input (PCRE error code ' . preg_last_error() . ')'
	            );
	        }

	        $num_unigrams = count( $unigrams );

	        // A phrase can never contain more words than the text itself, so
	        // longer lengths are skipped up front.
	        $max_n = min( $this->n, $num_unigrams );

	        // generate the longer phrases, shortest length first
	        $ngrams = array();
	        for ( $n = 2; $n <= $max_n; $n++ ) {
	            // slide a window of $n words across the text
	            for ( $i = 0; $i <= $num_unigrams - $n; $i++ ) {
	                $ngrams[] = implode( ' ', array_slice( $unigrams, $i, $n ) );
	            }
	        }

	        // unigrams first, then the longer phrases
	        return array_merge( $unigrams, $ngrams );

	    }
	}