#ifndef SINGA_TRAINER_WORKER_H_
#define SINGA_TRAINER_WORKER_H_

#include <memory>
#include <string>
#include "neuralnet/neuralnet.h"
#include "proto/job.pb.h"
#include "communication/socket.h"

namespace singa {
using std::shared_ptr;
using std::string;

const int kCollectSleepTime = 5;  //!< sleep 5 milliseconds if the Param is not updated to the expected version

/**
 * The Worker class which runs the training algorithm.
 */
class Worker {
 public:
  static Worker* Create(const JobProto& proto);
  virtual void Init(int thread_id, int grp_id, int id);
  /** Setup members. */
  void Setup(const JobProto& job, shared_ptr<NeuralNet> train_net,
             shared_ptr<NeuralNet> valid_net, shared_ptr<NeuralNet> test_net);
  /** Init all local params (i.e., params from layers resident in this worker). */
  void InitLocalParams();
  /** Main function of Worker. */
  void Run();
  /** Checkpoint all params owned by the worker from the first group onto disk. */
  void Checkpoint(int step, shared_ptr<NeuralNet> net);
  /** Train one mini-batch. */
  virtual void TrainOneBatch(int step, Metric* perf) = 0;
  /** Test/validate one mini-batch. */
  virtual void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
                            Metric* perf) = 0;
  /** Test the performance of the learned model on the validation or test dataset. */
  void Test(int nsteps, Phase phase, shared_ptr<NeuralNet> net);
  /** Put Param to server. */
  int Put(Param* param, int step);
  /** Get Param with a specific version from the server. */
  int Get(Param* param, int step);
  /** Update Param. */
  int Update(Param* param, int step);
  /** Block until the param is updated since sending the update request. */
  int Collect(Param* param, int step);
  /** Call Collect for every param of net. */
  int CollectAll(shared_ptr<NeuralNet> net, int step);
  /** Report performance to the stub. */
  void Report(const string& prefix, const Metric& perf);
  /** Receive blobs from other workers due to model partitions. */
  void ReceiveBlobs(bool data, bool grad, BridgeLayer* layer,
                    shared_ptr<NeuralNet> net);
  /** Send blobs to other workers due to model partitions. */
  void SendBlobs(bool data, bool grad, BridgeLayer* layer,
                 shared_ptr<NeuralNet> net);
  /** Check if it is time to display training info, e.g., loss and precision. */
  inline bool DisplayNow(int step) const;
  /** Check if it is time to display debug info. */
  inline bool DisplayDebugInfo(int step) const;
  /** Check if it is time to stop. */
  inline bool StopNow(int step) const;
  /** Check if it is time to do checkpoint. */
  inline bool CheckpointNow(int step) const;
  /** Check if it is time to do test. */
  inline bool TestNow(int step) const;
  /** Check if it is time to do validation. */
  inline bool ValidateNow(int step) const;
  int grp_id() const { return grp_id_; }
  /** worker ID within the worker group. */
  int id() const { return id_; }
 protected:
  int thread_id_, grp_id_, id_;
  shared_ptr<NeuralNet> train_net_, test_net_, validation_net_;
  Dealer* layer_dealer_, *dealer_;
};
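
// Illustrative sketch, not part of the original header: the worker life cycle
// implied by the declarations above. The free function, its name, and the
// hard-coded ids are hypothetical; in SINGA this wiring is presumably done by
// the surrounding driver/stub code rather than by user code.
inline void RunWorkerExample(const JobProto& job,
                             shared_ptr<NeuralNet> train_net,
                             shared_ptr<NeuralNet> valid_net,
                             shared_ptr<NeuralNet> test_net) {
  Worker* worker = Worker::Create(job);  // factory picks a concrete subclass
  worker->Init(0, 0, 0);                 // thread id, worker group id, worker id
  worker->Setup(job, train_net, valid_net, test_net);
  worker->Run();                         // runs training until it is time to stop
}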

/**
 * Worker that trains the net with forward and backward passes (back-propagation).
 */
class BPWorker: public Worker {
 public:
  void Init(int thread_id, int grp_id, int id) override;
  /** Train one mini-batch. */
  void TrainOneBatch(int step, Metric* perf) override;
  /** Test/validate one mini-batch. */
  void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
                    Metric* perf) override;
  void Forward(int step, Phase phase, shared_ptr<NeuralNet> net, Metric* perf);
  void Backward(int step, shared_ptr<NeuralNet> net);
};
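
// Illustrative sketch, not the actual implementation: given the Forward and
// Backward declarations above, BPWorker::TrainOneBatch would typically run a
// forward pass over train_net_ to fill in perf, then a backward pass to
// compute the gradients, along the lines of (the kTrain Phase value is an
// assumption here):
//
//   void BPWorker::TrainOneBatch(int step, Metric* perf) {
//     Forward(step, kTrain, train_net_, perf);
//     Backward(step, train_net_);
//   }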

class CDWorker: public Worker {
 public:
  /** Train one mini-batch. */
  void TrainOneBatch(int step, Metric* perf) override;
  /** Test/validate one mini-batch. */
  void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
                    Metric* perf) override;
};

inline int BlobTrgt(int grp, int layer) {
  return (grp << 16) | layer;
}

inline int BlobGrp(int blob_trgt) {
  return blob_trgt >> 16;
}

inline int BlobLayer(int blob_trgt) {
  static int mask = (1 << 16) - 1;
  return blob_trgt & mask;
}
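
// Illustrative note, not part of the original header: BlobTrgt packs a group
// id into the high 16 bits of an int and a layer id into the low 16 bits, and
// BlobGrp/BlobLayer recover the two fields, e.g.:
//
//   int trgt = BlobTrgt(3, 7);  // (3 << 16) | 7 == 196615
//   BlobGrp(trgt);              // == 3
//   BlobLayer(trgt);            // == 7
//
// Both ids are therefore assumed to fit in 16 bits.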

}  // namespace singa

#endif  // SINGA_TRAINER_WORKER_H_