content/api-v0.1.0/trainer_8h_source.html

 /************************************************************

 *

 * Licensed to the Apache Software Foundation (ASF) under one

 * or more contributor license agreements.  See the NOTICE file

 * distributed with this work for additional information

 * regarding copyright ownership.  The ASF licenses this file

 * to you under the Apache License, Version 2.0 (the

 * "License"); you may not use this file except in compliance

 * with the License.  You may obtain a copy of the License at

 *

 *   http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing,

 * software distributed under the License is distributed on an

 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

 * KIND, either express or implied.  See the License for the

 * specific language governing permissions and limitations

 * under the License.

 *

 *************************************************************/


 #ifndef INCLUDE_TRAINER_TRAINER_H_

 #define INCLUDE_TRAINER_TRAINER_H_

 #include <queue>
 #include <unordered_map>
 #include <vector>

 #include "proto/job.pb.h"
 #include "proto/singa.pb.h"
 #include "utils/param.h"
 #include "utils/singleton.h"
 #include "utils/factory.h"
 #include "neuralnet/neuralnet.h"
 #include "trainer/worker.h"
 #include "trainer/server.h"
 #include "communication/socket.h"


 namespace singa {

 class Trainer{

  public:

   ~Trainer();

   void Start(bool resume, const SingaProto& singaConf, JobProto* jobConf);


  protected:

   void Resume(JobProto* jobConf);

   vector<Server*> CreateServers(int nthread, const JobProto& jobConf);

   vector<Worker*> CreateWorkers(int nthread, const JobProto& jobConf);


   void SetupWorkerServer(

     const JobProto& jobConf,

     const vector<Worker*>& workers,

     const vector<Server*>& servers);


   void Run(const vector<Worker*>& workers, const vector<Server*>& servers);

   void DisplayMetric(Msg** msg);

   Dealer* CreateInterProcsDealer(int dst_procs);

   void HandleLocalMsg(std::queue<Msg*>* msg_queue, Msg** msg);


     const vector<Msg*> HandleGet(ParamEntry* entry, Msg** msg);

     void HandleGetResponse(ParamEntry* entry, Msg** msg);


     const vector<Msg*> HandleUpdate(ParamEntry* entry, Msg** msg);

   void HandleUpdateResponse(ParamEntry* entry, Msg** msg);


     const vector<Msg*> HandlePut(ParamEntry* entry, Msg** msg);


   void GenMsgs(int type, int version, ParamEntry* entry,

     Msg* msg, vector<Msg*> *ret);

   inline int Hash(int grp_id, int param_id) {

     return grp_id * 997 + param_id;

   }


  protected:

   int procs_id_;

   Router *router_;

   std::unordered_map<int, ParamEntry*> worker_shard_;

   vector<int> slice2server_;

 };

 } /* singa */

 #endif // INCLUDE_TRAINER_TRAINER_H_

singa::Trainer::Start
void Start(bool resume, const SingaProto &singaConf, JobProto *jobConf)
Entrance function which constructs the workers and servers, and launches one thread per worker/server...

singa::Trainer::HandlePut
const vector< Msg * > HandlePut(ParamEntry *entry, Msg **msg)
Generate a request message to Put the parameter object.

singa::Trainer::HandleUpdate
const vector< Msg * > HandleUpdate(ParamEntry *entry, Msg **msg)
Generate a request message to Update the parameter object.

singa::Msg
Msg used to transfer Param info (gradient or value), feature blob, etc between workers, stubs and servers.
Definition: msg.h:91

singa::Trainer::CreateWorkers
vector< Worker * > CreateWorkers(int nthread, const JobProto &jobConf)
Create workers instances.

singa::ParamEntry
ParamEntry is used for aggregating gradients of Params shared by workers from the same group...
Definition: param.h:335

singa::Trainer::HandleLocalMsg
void HandleLocalMsg(std::queue< Msg * > *msg_queue, Msg **msg)
Handle messages to local servers and local stub.

singa::Trainer::CreateInterProcsDealer
Dealer * CreateInterProcsDealer(int dst_procs)
Create a socket to send msg to the specified process.

singa::Dealer
Definition: socket.h:91

singa::Trainer::Resume
void Resume(JobProto *jobConf)
Setting the checkpoint field of model configuration to resume training.

singa::Trainer::worker_shard_
std::unordered_map< int, ParamEntry * > worker_shard_
map from slice to the server that updates it
Definition: trainer.h:158

singa::Trainer::Hash
int Hash(int grp_id, int param_id)
Get a hash id for a Param object from a group.
Definition: trainer.h:151

singa::Trainer::CreateServers
vector< Server * > CreateServers(int nthread, const JobProto &jobConf)
Create server instances.

singa::Router
Definition: socket.h:125

singa::Trainer::SetupWorkerServer
void SetupWorkerServer(const JobProto &jobConf, const vector< Worker * > &workers, const vector< Server * > &servers)
Setup workers and servers.

singa::Trainer
Every running process has a training object which launches one or more worker (and server) threads...
Definition: trainer.h:44

singa::Trainer::HandleGet
const vector< Msg * > HandleGet(ParamEntry *entry, Msg **msg)
Generate a request message to Get the parameter object.

singa::Trainer::DisplayMetric
void DisplayMetric(Msg **msg)
Display metrics to log (standard output)

singa::Trainer::GenMsgs
void GenMsgs(int type, int version, ParamEntry *entry, Msg *msg, vector< Msg * > *ret)
Called by HandlePut, HandleUpdate and HandleGet functions.