Apache SINGA
A distributed deep learning platform.
 All Classes Namespaces Files Functions Variables Typedefs Macros
data_shard.h
1 /************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21 
22 #ifndef SINGA_UTILS_DATA_SHARD_H_
23 #define SINGA_UTILS_DATA_SHARD_H_
24 
25 #include <google/protobuf/message.h>
26 #include <fstream>
27 #include <string>
28 #include <unordered_set>
29 
30 namespace singa {
31 
// Data shard stores training/validation/test tuples.
//
// A shard is opened on a folder in one of three modes (kRead, kCreate,
// kAppend) and exposes key/value tuples through Next() and Insert().
// NOTE(review): only the declarations are visible in this header; the
// behaviour notes on the members below should be confirmed against the
// implementation file.
51 class DataShard {
52  public:
    // Open modes accepted by the `mode` constructor argument.
53  enum {
54  // read only mode used in training
55  kRead = 0,
56  // write mode used in creating shard (will overwrite previous one)
57  kCreate = 1,
58  // append mode, e.g. used when previous creating crashes
59  kAppend = 2
60  };
61 
    // Initialise the shard object for `folder` in the given `mode`
    // (one of kRead / kCreate / kAppend).
70  DataShard(const std::string& folder, int mode);
    // Same as above with an explicit `capacity`.
    // NOTE(review): presumably the size in bytes of the internal buffer
    // (see capacity_) -- confirm in the implementation.
71  DataShard(const std::string& folder, int mode, int capacity);
72  ~DataShard();
73 
    // Read the next tuple from the shard into `key` and the protobuf
    // message `val`.
    // NOTE(review): the bool result presumably signals whether a tuple
    // was read -- confirm.
82  bool Next(std::string* key, google::protobuf::Message* val);
    // Overload of Next() that takes the value as a std::string instead of
    // a protobuf message.
91  bool Next(std::string* key, std::string* val);
    // Append one tuple (key plus protobuf message) to the shard.
    // NOTE(review): keys_ below is kept "to avoid replicated record", so a
    // false result presumably means a duplicate key -- confirm.
99  bool Insert(const std::string& key, const google::protobuf::Message& tuple);
    // Overload of Insert() that takes the value as a std::string.
107  bool Insert(const std::string& key, const std::string& tuple);
    // Move the read pointer to the head of the shard file.
112  void SeekToFirst();
    // Flush buffered data to disk.
117  void Flush();
    // Iterate through all tuples to get the number of tuples in the shard.
123  int Count();
    // Return a copy of the stored path_ string.
127  inline std::string path() { return path_; }
128 
129  protected:
    // Key-only variant of Next().
    // NOTE(review): the meaning of the int result is not visible from this
    // header -- confirm in the implementation.
136  int Next(std::string* key);
    // Set up the disk pointer to the right position for appending, for the
    // case where a previous write crashed.
    // NOTE(review): the meaning of the int result is not visible here.
144  int PrepareForAppend(const std::string& path);
    // Read data from disk if the current data in the buffer is not a full
    // field of `size` bytes.
150  bool PrepareNextField(int size);
151 
152  private:
    // Open mode. Declared as char here although the constructors take the
    // mode as int.
153  char mode_ = 0;
    // Value returned by path().
154  std::string path_ = "";
155  // either ifstream or ofstream
156  std::fstream fdat_;
157  // to avoid replicated record
158  std::unordered_set<std::string> keys_;
159  // internal buffer
    // NOTE(review): raw pointer; allocation and release are presumably
    // handled by the constructors and ~DataShard() -- confirm.
160  char* buf_ = nullptr;
161  // offset inside the buf_
162  int offset_ = 0;
163  // allocated bytes for the buf_
164  int capacity_ = 0;
165  // bytes in buf_, used in reading
166  int bufsize_ = 0;
167 };
168 
169 } // namespace singa
170 
171 #endif // SINGA_UTILS_DATA_SHARD_H_
bool Next(std::string *key, google::protobuf::Message *val)
Read the next tuple from the shard.
bool Insert(const std::string &key, const google::protobuf::Message &tuple)
Append one tuple to the shard.
DataShard(const std::string &folder, int mode)
Initialize the shard object.
int Count()
Iterate through all tuples to get the total number of tuples.
std::string path()
Definition: data_shard.h:127
Data shard stores training/validation/test tuples.
Definition: data_shard.h:51
int PrepareForAppend(const std::string &path)
Set up the disk pointer to the right position for append in case the previous write crashes...
bool PrepareNextField(int size)
Read data from disk if the current data in the buffer is not a full field.
void Flush()
Flush buffered data to disk.
void SeekToFirst()
Move the read pointer to the head of the shard file.