Skip to content

Commit

Permalink
Ngram hashing to estimator (dotnet#1811)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ivanidzo4ka authored Dec 13, 2018
1 parent 67c4da5 commit ad57f02
Show file tree
Hide file tree
Showing 54 changed files with 3,374 additions and 1,529 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ public static class ConversionsExtensionsCatalog
/// <param name="inputColumn">Name of the input column.</param>
/// <param name="outputColumn">Name of the column to be transformed. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string inputColumn, string outputColumn = null,
int hashBits = HashDefaults.HashBits, int invertHash = HashDefaults.InvertHash)
{
    // Obtain the environment associated with the catalog, then hand all column settings to the estimator.
    var env = CatalogUtils.GetEnvironment(catalog);
    return new HashingEstimator(env, inputColumn, outputColumn, hashBits, invertHash);
}
Expand Down Expand Up @@ -99,8 +102,8 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="maxNumTerms">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="sort">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingTransformer.SortOrder.Occurrence"/> is chosen, they will be in the order encountered.
/// If <see cref="ValueToKeyMappingTransformer.SortOrder.Value"/> is chosen, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
string inputColumn,
string outputColumn = null,
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/DropSlotsTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ public sealed class ColumnInfo
/// Describes how the transformer handles one input-output column pair.
/// </summary>
/// <param name="input">Name of the input column.</param>
/// <param name="output">Name of the column resulting from the transformation of <paramref name="input"/>. Null means <paramref name="input"/> is replaced. </param>
/// <param name="output">Name of the column resulting from the transformation of <paramref name="input"/>. Null means <paramref name="input"/> is replaced.</param>
/// <param name="slots">Ranges of indices in the input column to be dropped. Setting max in <paramref name="slots"/> to null sets max to int.MaxValue.</param>
public ColumnInfo(string input, string output = null, params (int min, int? max)[] slots)
{
Expand Down Expand Up @@ -252,7 +252,7 @@ private static VersionInfo GetVersionInfo()
/// </summary>
/// <param name="env">The environment to use.</param>
/// <param name="input">Name of the input column.</param>
/// <param name="output">Name of the column resulting from the transformation of <paramref name="input"/>. Null means <paramref name="input"/> is replaced. </param>
/// <param name="output">Name of the column resulting from the transformation of <paramref name="input"/>. Null means <paramref name="input"/> is replaced.</param>
/// <param name="min">Specifies the lower bound of the range of slots to be dropped. The lower bound is inclusive.</param>
/// <param name="max">Specifies the upper bound of the range of slots to be dropped. The upper bound is exclusive.</param>
public SlotsDroppingTransformer(IHostEnvironment env, string input, string output = null, int min = default, int? max = null)
Expand Down
10 changes: 8 additions & 2 deletions src/Microsoft.ML.Data/Transforms/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,10 @@ public sealed class ColumnInfo
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="ordered">Whether the position of each term should be included in the hash.</param>
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public ColumnInfo(string input, string output,
int hashBits = HashingEstimator.Defaults.HashBits,
uint seed = HashingEstimator.Defaults.Seed,
Expand Down Expand Up @@ -1211,7 +1214,10 @@ internal static bool IsColumnTypeValid(ColumnType type)
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public HashingEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null,
int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash)
: this(env, new HashingTransformer.ColumnInfo(inputColumn, outputColumn ?? inputColumn, hashBits: hashBits, invertHash: invertHash))
Expand Down
Loading

0 comments on commit ad57f02

Please sign in to comment.