systemml.mllearn package
Submodules
systemml.mllearn.estimators module
class systemml.mllearn.estimators.LinearRegression(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=inf, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLRegressor
Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
Examples
>>> import numpy as np
>>> from sklearn import datasets
>>> from systemml.mllearn import LinearRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> # Load the diabetes dataset
>>> diabetes = datasets.load_diabetes()
>>> # Use only one feature
>>> diabetes_X = diabetes.data[:, np.newaxis, 2]
>>> # Split the data into training/testing sets
>>> diabetes_X_train = diabetes_X[:-20]
>>> diabetes_X_test = diabetes_X[-20:]
>>> # Split the targets into training/testing sets
>>> diabetes_y_train = diabetes.target[:-20]
>>> diabetes_y_test = diabetes.target[-20:]
>>> # Create linear regression object
>>> regr = LinearRegression(sparkSession, solver='newton-cg')
>>> # Train the model using the training sets
>>> regr.fit(diabetes_X_train, diabetes_y_train)
>>> # The mean squared error
>>> print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
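The model's predictions can also be checked with scikit-learn's regression metrics; a minimal sketch, assuming the variables from the example above are still in scope:
>>> from sklearn.metrics import r2_score
>>> # Coefficient of determination on the 20 held-out samples
>>> print("R^2: %.2f" % r2_score(diabetes_y_test, regr.predict(diabetes_X_test)))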
class systemml.mllearn.estimators.LogisticRegression(sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=1e-06, C=1.0, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs both binomial and multinomial logistic regression.
Examples
Scikit-learn way
>>> from sklearn import datasets
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target + 1
>>> n_samples = len(X_digits)
>>> X_train = X_digits[:int(.9 * n_samples)]
>>> y_train = y_digits[:int(.9 * n_samples)]
>>> X_test = X_digits[int(.9 * n_samples):]
>>> y_test = y_digits[int(.9 * n_samples):]
>>> logistic = LogisticRegression(sparkSession)
>>> print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
MLPipeline way
>>> from pyspark.ml import Pipeline
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.ml.feature import HashingTF, Tokenizer
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> training = sparkSession.createDataFrame([
...     (0, "a b c d e spark", 1.0),
...     (1, "b d", 2.0),
...     (2, "spark f g h", 1.0),
...     (3, "hadoop mapreduce", 2.0),
...     (4, "b spark who", 1.0),
...     (5, "g d a y", 2.0),
...     (6, "spark fly", 1.0),
...     (7, "was mapreduce", 2.0),
...     (8, "e spark program", 1.0),
...     (9, "a e c l", 2.0),
...     (10, "spark compile", 1.0),
...     (11, "hadoop software", 2.0)
... ], ["id", "text", "label"])
>>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
>>> hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
>>> lr = LogisticRegression(sparkSession)
>>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
>>> model = pipeline.fit(training)
>>> test = sparkSession.createDataFrame([
...     (12, "spark i j k"),
...     (13, "l m n"),
...     (14, "mapreduce spark"),
...     (15, "apache hadoop")], ["id", "text"])
>>> prediction = model.transform(test)
>>> prediction.show()
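The transformed output is an ordinary Spark DataFrame, so the usual DataFrame operations apply; for instance, to keep only the columns of interest (a sketch, assuming the default 'prediction' output column name):
>>> prediction.select("id", "text", "prediction").show()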
class systemml.mllearn.estimators.SVM(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=1.0, is_multi_class=False, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs both binary-class and multiclass SVM (Support Vector Machines).
Examples
>>> from sklearn import datasets
>>> from systemml.mllearn import SVM
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target
>>> n_samples = len(X_digits)
>>> X_train = X_digits[:int(.9 * n_samples)]
>>> y_train = y_digits[:int(.9 * n_samples)]
>>> X_test = X_digits[int(.9 * n_samples):]
>>> y_test = y_digits[int(.9 * n_samples):]
>>> svm = SVM(sparkSession, is_multi_class=True)
>>> print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
class systemml.mllearn.estimators.NaiveBayes(sparkSession, laplace=1.0, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs Naive Bayes.
Examples
>>> from sklearn.datasets import fetch_20newsgroups
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> from systemml.mllearn import NaiveBayes
>>> from sklearn import metrics
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
>>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
>>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
>>> vectorizer = TfidfVectorizer()
>>> # Both vectors and vectors_test are SciPy CSR matrices
>>> vectors = vectorizer.fit_transform(newsgroups_train.data)
>>> vectors_test = vectorizer.transform(newsgroups_test.data)
>>> nb = NaiveBayes(sparkSession)
>>> nb.fit(vectors, newsgroups_train.target)
>>> pred = nb.predict(vectors_test)
>>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
class systemml.mllearn.estimators.Caffe2DML(sparkSession, solver, input_shape, transferUsingDF=False, tensorboard_log_dir=None)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs training/prediction for a given Caffe network.
Examples
>>> from systemml.mllearn import Caffe2DML
>>> from pyspark.sql import SparkSession
>>> from mlxtend.data import mnist_data
>>> import numpy as np
>>> from sklearn.utils import shuffle
>>> import urllib.request
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> X, y = mnist_data()
>>> X, y = shuffle(X, y)
>>> imgShape = (1, 28, 28)
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet.proto', 'lenet.proto')
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet_solver.proto', 'lenet_solver.proto')
>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=imgShape).set(max_iter=500)
>>> caffe2DML.fit(X, y)
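After training, the same scikit-learn style calls used by the other mllearn classifiers apply; a minimal sketch (scoring here on the training data from the example above, where a held-out split would normally be used):
>>> # Accuracy in scikit-learn style; predict returns class labels
>>> print('Accuracy: %f' % caffe2DML.score(X, y))
>>> preds = caffe2DML.predict(X)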
load(weights=None, sep='/', ignore_weights=None, eager=False)

Load a pretrained model.
Parameters:
- weights: directory where the learned weights are stored (default: None)
- sep: separator to use (default: '/')
- ignore_weights: names of layers not to read from the weights directory (list of strings, default: None)
- eager: whether to load the model eagerly (default: False)
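For example, previously saved weights can be restored instead of training from scratch; a sketch in which 'lenet_weights' and 'fc8' are placeholder names:
>>> # Read all layer weights except the hypothetical 'fc8' layer
>>> caffe2DML.load(weights='lenet_weights', ignore_weights=['fc8'])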
set(debug=None, train_algo=None, test_algo=None, parallel_batches=None, output_activations=None)

Set configuration parameters for Caffe2DML.
Parameters:
- debug: whether to add debugging DML code, such as a classification report and printing of the DML script (default: False)
- train_algo: one of minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)
- test_algo: one of minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)
- parallel_batches: number of parallel batches
- output_activations: (developer flag) directory in which to write the activations of each layer as CSV during prediction; to be used only in batch mode (default: None)
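For illustration, training can be switched to a data-parallel algorithm before calling fit; a sketch with arbitrary values:
>>> # Run allreduce over parallel batches and emit debugging output
>>> caffe2DML.set(train_algo='allreduce_parallel_batches', parallel_batches=4, debug=True)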
summary()

Print the summary of the network.
visualize(layerName=None, varType='weight', aggFn='mean')

Visualize the training procedure (requires validation_percentage to be non-zero). When called with no arguments, training and validation loss are visualized.
Parameters:
- layerName: name of the layer in the Caffe prototxt
- varType: one of 'weight', 'bias', 'dweight', 'dbias', 'output' or 'doutput'
- aggFn: one of 'sum', 'mean', 'var' or 'sd'
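For example, to follow the mean of a layer's weights during training (a sketch; 'conv1' is a placeholder layer name, and visualize is typically called before fit):
>>> caffe2DML.visualize('conv1', varType='weight', aggFn='mean')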
Module contents
SystemML Algorithms
Classification Algorithms | Description
---|---
LogisticRegression | Performs binomial and multinomial logistic regression
SVM | Performs both binary-class and multi-class SVM
NaiveBayes | Multinomial Naive Bayes classifier

Regression Algorithms | Description
---|---
LinearRegression | Performs linear regression
class systemml.mllearn.LinearRegression(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=inf, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLRegressor
Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
Examples
>>> import numpy as np
>>> from sklearn import datasets
>>> from systemml.mllearn import LinearRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> # Load the diabetes dataset
>>> diabetes = datasets.load_diabetes()
>>> # Use only one feature
>>> diabetes_X = diabetes.data[:, np.newaxis, 2]
>>> # Split the data into training/testing sets
>>> diabetes_X_train = diabetes_X[:-20]
>>> diabetes_X_test = diabetes_X[-20:]
>>> # Split the targets into training/testing sets
>>> diabetes_y_train = diabetes.target[:-20]
>>> diabetes_y_test = diabetes.target[-20:]
>>> # Create linear regression object
>>> regr = LinearRegression(sparkSession, solver='newton-cg')
>>> # Train the model using the training sets
>>> regr.fit(diabetes_X_train, diabetes_y_train)
>>> # The mean squared error
>>> print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
class systemml.mllearn.LogisticRegression(sparkSession, penalty='l2', fit_intercept=True, normalize=False, max_iter=100, max_inner_iter=0, tol=1e-06, C=1.0, solver='newton-cg', transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs both binomial and multinomial logistic regression.
Examples
Scikit-learn way
>>> from sklearn import datasets
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target + 1
>>> n_samples = len(X_digits)
>>> X_train = X_digits[:int(.9 * n_samples)]
>>> y_train = y_digits[:int(.9 * n_samples)]
>>> X_test = X_digits[int(.9 * n_samples):]
>>> y_test = y_digits[int(.9 * n_samples):]
>>> logistic = LogisticRegression(sparkSession)
>>> print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
MLPipeline way
>>> from pyspark.ml import Pipeline
>>> from systemml.mllearn import LogisticRegression
>>> from pyspark.ml.feature import HashingTF, Tokenizer
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> training = sparkSession.createDataFrame([
...     (0, "a b c d e spark", 1.0),
...     (1, "b d", 2.0),
...     (2, "spark f g h", 1.0),
...     (3, "hadoop mapreduce", 2.0),
...     (4, "b spark who", 1.0),
...     (5, "g d a y", 2.0),
...     (6, "spark fly", 1.0),
...     (7, "was mapreduce", 2.0),
...     (8, "e spark program", 1.0),
...     (9, "a e c l", 2.0),
...     (10, "spark compile", 1.0),
...     (11, "hadoop software", 2.0)
... ], ["id", "text", "label"])
>>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
>>> hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
>>> lr = LogisticRegression(sparkSession)
>>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
>>> model = pipeline.fit(training)
>>> test = sparkSession.createDataFrame([
...     (12, "spark i j k"),
...     (13, "l m n"),
...     (14, "mapreduce spark"),
...     (15, "apache hadoop")], ["id", "text"])
>>> prediction = model.transform(test)
>>> prediction.show()
class systemml.mllearn.SVM(sparkSession, fit_intercept=True, normalize=False, max_iter=100, tol=1e-06, C=1.0, is_multi_class=False, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs both binary-class and multiclass SVM (Support Vector Machines).
Examples
>>> from sklearn import datasets
>>> from systemml.mllearn import SVM
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> digits = datasets.load_digits()
>>> X_digits = digits.data
>>> y_digits = digits.target
>>> n_samples = len(X_digits)
>>> X_train = X_digits[:int(.9 * n_samples)]
>>> y_train = y_digits[:int(.9 * n_samples)]
>>> X_test = X_digits[int(.9 * n_samples):]
>>> y_test = y_digits[int(.9 * n_samples):]
>>> svm = SVM(sparkSession, is_multi_class=True)
>>> print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
class systemml.mllearn.NaiveBayes(sparkSession, laplace=1.0, transferUsingDF=False)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs Naive Bayes.
Examples
>>> from sklearn.datasets import fetch_20newsgroups
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> from systemml.mllearn import NaiveBayes
>>> from sklearn import metrics
>>> from pyspark.sql import SparkSession
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
>>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
>>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
>>> vectorizer = TfidfVectorizer()
>>> # Both vectors and vectors_test are SciPy CSR matrices
>>> vectors = vectorizer.fit_transform(newsgroups_train.data)
>>> vectors_test = vectorizer.transform(newsgroups_test.data)
>>> nb = NaiveBayes(sparkSession)
>>> nb.fit(vectors, newsgroups_train.target)
>>> pred = nb.predict(vectors_test)
>>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
class systemml.mllearn.Caffe2DML(sparkSession, solver, input_shape, transferUsingDF=False, tensorboard_log_dir=None)

Bases: systemml.mllearn.estimators.BaseSystemMLClassifier
Performs training/prediction for a given Caffe network.
Examples
>>> from systemml.mllearn import Caffe2DML
>>> from pyspark.sql import SparkSession
>>> from mlxtend.data import mnist_data
>>> import numpy as np
>>> from sklearn.utils import shuffle
>>> import urllib.request
>>> sparkSession = SparkSession.builder.getOrCreate()
>>> X, y = mnist_data()
>>> X, y = shuffle(X, y)
>>> imgShape = (1, 28, 28)
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet.proto', 'lenet.proto')
>>> urllib.request.urlretrieve('https://raw.githubusercontent.com/niketanpansare/model_zoo/master/caffe/vision/lenet/mnist/lenet_solver.proto', 'lenet_solver.proto')
>>> caffe2DML = Caffe2DML(sparkSession, solver='lenet_solver.proto', input_shape=imgShape).set(max_iter=500)
>>> caffe2DML.fit(X, y)
load(weights=None, sep='/', ignore_weights=None, eager=False)

Load a pretrained model.
Parameters:
- weights: directory where the learned weights are stored (default: None)
- sep: separator to use (default: '/')
- ignore_weights: names of layers not to read from the weights directory (list of strings, default: None)
- eager: whether to load the model eagerly (default: False)
set(debug=None, train_algo=None, test_algo=None, parallel_batches=None, output_activations=None)

Set configuration parameters for Caffe2DML.
Parameters:
- debug: whether to add debugging DML code, such as a classification report and printing of the DML script (default: False)
- train_algo: one of minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)
- test_algo: one of minibatch, batch, allreduce_parallel_batches or allreduce (default: minibatch)
- parallel_batches: number of parallel batches
- output_activations: (developer flag) directory in which to write the activations of each layer as CSV during prediction; to be used only in batch mode (default: None)
summary()

Print the summary of the network.
visualize(layerName=None, varType='weight', aggFn='mean')

Visualize the training procedure (requires validation_percentage to be non-zero). When called with no arguments, training and validation loss are visualized.
Parameters:
- layerName: name of the layer in the Caffe prototxt
- varType: one of 'weight', 'bias', 'dweight', 'dbias', 'output' or 'doutput'
- aggFn: one of 'sum', 'mean', 'var' or 'sd'