1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.io.hfile;
20
21 import java.io.ByteArrayOutputStream;
22 import java.io.DataInput;
23 import java.io.DataInputStream;
24 import java.io.DataOutput;
25 import java.io.DataOutputStream;
26 import java.io.IOException;
27 import java.nio.ByteBuffer;
28 import java.util.ArrayList;
29 import java.util.Collections;
30 import java.util.List;
31 import java.util.concurrent.atomic.AtomicReference;
32
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.apache.hadoop.conf.Configuration;
36 import org.apache.hadoop.fs.FSDataOutputStream;
37 import org.apache.hadoop.hbase.Cell;
38 import org.apache.hadoop.hbase.CellComparator;
39 import org.apache.hadoop.hbase.CellUtil;
40 import org.apache.hadoop.hbase.HConstants;
41 import org.apache.hadoop.hbase.KeyValue;
42 import org.apache.hadoop.hbase.KeyValue.KeyOnlyKeyValue;
43 import org.apache.hadoop.hbase.ByteBufferedKeyOnlyKeyValue;
44 import org.apache.hadoop.hbase.classification.InterfaceAudience;
45 import org.apache.hadoop.hbase.io.HeapSize;
46 import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
47 import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader;
48 import org.apache.hadoop.hbase.nio.ByteBuff;
49 import org.apache.hadoop.hbase.util.Bytes;
50 import org.apache.hadoop.hbase.util.ClassSize;
51 import org.apache.hadoop.hbase.util.ObjectIntPair;
52 import org.apache.hadoop.io.WritableUtils;
53 import org.apache.hadoop.util.StringUtils;
54
55
56
57
58
59
60
61
62
63
64
65
66 @InterfaceAudience.Private
67 public class HFileBlockIndex {
68
private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class);

/** Default maximum size of an index block chunk before a new one is started. */
static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;

/** Configuration key used to override {@link #DEFAULT_MAX_CHUNK_SIZE}. */
public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";

/**
 * Bytes stored per non-root index entry in addition to the key bytes
 * themselves: the entry's block offset (long) and on-disk size (int).
 * See {@link BlockIndexChunk#writeNonRoot(DataOutput)} for the layout.
 */
static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT
    + Bytes.SIZEOF_LONG;

/** Error text thrown by the writer when inline blocks are requested in single-level mode. */
private static final String INLINE_BLOCKS_NOT_ALLOWED =
    "Inline blocks are not allowed in the single-level-only mode";

/**
 * Size of the mid-key metadata appended to the root index block of a
 * multi-level index: the middle leaf-level index block's offset (long), its
 * on-disk size (int), and the zero-based index of the mid-key entry within
 * that leaf block (int).
 */
private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG +
    2 * Bytes.SIZEOF_INT;
102
103
104
105
106
107
/**
 * A block index reader whose root-level keys are raw byte arrays, searched in
 * plain lexicographic byte order via {@link Bytes#binarySearch}. Cell-based
 * search and multi-level traversal are not implemented by this reader; see
 * {@link CellBasedKeyBlockIndexReader} for those.
 */
static class ByteArrayKeyBlockIndexReader extends BlockIndexReader {

  // First keys of the root-level index entries; sized by initialize() and
  // filled in order by add() while the root index is read.
  private byte[][] blockKeys;

  /**
   * @param treeLevel          number of levels in the block index
   * @param cachingBlockReader reader used to load blocks on demand
   */
  public ByteArrayKeyBlockIndexReader(final int treeLevel,
      final CachingBlockReader cachingBlockReader) {
    this(treeLevel);
    this.cachingBlockReader = cachingBlockReader;
  }

  public ByteArrayKeyBlockIndexReader(final int treeLevel) {
    // Number of levels in the index tree (1 = root-only index).
    searchTreeLevel = treeLevel;
  }

  /** Adds the heap footprint of the blockKeys array and its byte[] contents. */
  protected long calculateHeapSizeForBlockKeys(long heapSize) {
    if (blockKeys != null) {
      // Reference to the blockKeys array itself.
      heapSize += ClassSize.REFERENCE;
      // Array header plus one reference per key.
      heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);
      // Each key's own byte array.
      for (byte[] key : blockKeys) {
        heapSize += ClassSize.align(ClassSize.ARRAY + key.length);
      }
    }
    return heapSize;
  }

  @Override
  public boolean isEmpty() {
    return blockKeys.length == 0;
  }

  /**
   * @param i zero-based index of the root-level entry
   * @return the first key of the i-th root-level block
   */
  public byte[] getRootBlockKey(int i) {
    return blockKeys[i];
  }

  @Override
  public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks, boolean pread, boolean isCompaction,
      DataBlockEncoding expectedDataBlockEncoding) throws IOException {
    // Not implemented by this reader; always returns null.
    return null;
  }

  @Override
  public Cell midkey() throws IOException {
    // Not implemented by this reader; always returns null.
    return null;
  }

  protected void initialize(int numEntries) {
    blockKeys = new byte[numEntries][];
  }

  /** Appends one root-level entry; assumes initialize() has sized the arrays. */
  protected void add(final byte[] key, final long offset, final int dataSize) {
    blockOffsets[rootCount] = offset;
    blockKeys[rootCount] = key;
    blockDataSizes[rootCount] = dataSize;
    rootCount++;
  }

  /**
   * Finds the root-level block that may contain the given key.
   * NOTE(review): the CellComparator argument is unused here — raw byte order
   * from Bytes.binarySearch is applied instead.
   *
   * @return index of the last block whose first key is less than or equal to
   *         the search key, or -1 if the key is before the first block's key
   */
  @Override
  public int rootBlockContainingKey(byte[] key, int offset, int length, CellComparator comp) {
    int pos = Bytes.binarySearch(blockKeys, key, offset, length);
    if (pos >= 0) {
      // Exact match with a block's first key: the key is in that block.
      assert pos < blockKeys.length;
      return pos;
    }
    // Otherwise pos = -(insertionPoint + 1); the key belongs to the block
    // just before the insertion point (i - 1 is -1 if before the first key).
    int i = -pos - 1;
    assert 0 <= i && i <= blockKeys.length;
    return i - 1;
  }

  @Override
  public int rootBlockContainingKey(Cell key) {
    // Cell-based lookup is only supported by CellBasedKeyBlockIndexReader.
    throw new UnsupportedOperationException(
        "Cannot search for a key that is of Cell type. Only plain byte array keys " +
        "can be searched for");
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("size=" + rootCount).append("\n");
    for (int i = 0; i < rootCount; i++) {
      sb.append("key=").append(KeyValue.keyToString(blockKeys[i]))
          .append("\n offset=").append(blockOffsets[i])
          .append(", dataSize=" + blockDataSizes[i]).append("\n");
    }
    return sb.toString();
  }

}
219
220
221
222
223
224
225 static class CellBasedKeyBlockIndexReader extends BlockIndexReader {
226
227 private Cell[] blockKeys;
228
229 private AtomicReference<Cell> midKey = new AtomicReference<Cell>();
230
231 private CellComparator comparator;
232
233 public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel,
234 final CachingBlockReader cachingBlockReader) {
235 this(c, treeLevel);
236 this.cachingBlockReader = cachingBlockReader;
237 }
238
239 public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel) {
240
241 comparator = c;
242 searchTreeLevel = treeLevel;
243 }
244
245 protected long calculateHeapSizeForBlockKeys(long heapSize) {
246 if (blockKeys != null) {
247 heapSize += ClassSize.REFERENCE;
248
249 heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE);
250
251
252 for (Cell key : blockKeys) {
253 heapSize += ClassSize.align(CellUtil.estimatedHeapSizeOf(key));
254 }
255 }
256
257 heapSize += 2 * ClassSize.REFERENCE;
258 return heapSize;
259 }
260
261 @Override
262 public boolean isEmpty() {
263 return blockKeys.length == 0;
264 }
265
266
267
268
269
270 public Cell getRootBlockKey(int i) {
271 return blockKeys[i];
272 }
273 @Override
274 public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
275 boolean cacheBlocks, boolean pread, boolean isCompaction,
276 DataBlockEncoding expectedDataBlockEncoding) throws IOException {
277 int rootLevelIndex = rootBlockContainingKey(key);
278 if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
279 return null;
280 }
281
282
283 Cell nextIndexedKey = null;
284
285
286 long currentOffset = blockOffsets[rootLevelIndex];
287 int currentOnDiskSize = blockDataSizes[rootLevelIndex];
288
289 if (rootLevelIndex < blockKeys.length - 1) {
290 nextIndexedKey = blockKeys[rootLevelIndex + 1];
291 } else {
292 nextIndexedKey = HConstants.NO_NEXT_INDEXED_KEY;
293 }
294
295 int lookupLevel = 1;
296 int index = -1;
297
298 HFileBlock block = null;
299 boolean dataBlock = false;
300 KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue();
301 while (true) {
302 try {
303 if (currentBlock != null && currentBlock.getOffset() == currentOffset) {
304
305
306
307
308 block = currentBlock;
309 } else {
310
311
312 boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
313 BlockType expectedBlockType;
314 if (lookupLevel < searchTreeLevel - 1) {
315 expectedBlockType = BlockType.INTERMEDIATE_INDEX;
316 } else if (lookupLevel == searchTreeLevel - 1) {
317 expectedBlockType = BlockType.LEAF_INDEX;
318 } else {
319
320 expectedBlockType = BlockType.DATA;
321 }
322 block =
323 cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache, pread,
324 isCompaction, true, expectedBlockType, expectedDataBlockEncoding);
325 }
326
327 if (block == null) {
328 throw new IOException("Failed to read block at offset " + currentOffset
329 + ", onDiskSize=" + currentOnDiskSize);
330 }
331
332
333 if (block.getBlockType().isData()) {
334 dataBlock = true;
335 break;
336 }
337
338
339
340 if (++lookupLevel > searchTreeLevel) {
341 throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel
342 + ", searchTreeLevel=" + searchTreeLevel);
343 }
344
345
346
347 ByteBuff buffer = block.getBufferWithoutHeader();
348 index = locateNonRootIndexEntry(buffer, key, comparator);
349 if (index == -1) {
350
351
352 throw new IOException("The key "
353 + CellUtil.getCellKeyAsString(key)
354 + " is before the" + " first key of the non-root index block " + block);
355 }
356
357 currentOffset = buffer.getLong();
358 currentOnDiskSize = buffer.getInt();
359
360
361 byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1);
362 if (nonRootIndexedKey != null) {
363 tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length);
364 nextIndexedKey = tmpNextIndexKV;
365 }
366 } finally {
367 if (!dataBlock) {
368
369
370 cachingBlockReader.returnBlock(block);
371 }
372 }
373 }
374
375 if (lookupLevel != searchTreeLevel) {
376 assert dataBlock == true;
377
378
379
380 cachingBlockReader.returnBlock(block);
381 throw new IOException("Reached a data block at level " + lookupLevel +
382 " but the number of levels is " + searchTreeLevel);
383 }
384
385
386 BlockWithScanInfo blockWithScanInfo = new BlockWithScanInfo(block, nextIndexedKey);
387 return blockWithScanInfo;
388 }
389
390 @Override
391 public Cell midkey() throws IOException {
392 if (rootCount == 0)
393 throw new IOException("HFile empty");
394
395 Cell targetMidKey = this.midKey.get();
396 if (targetMidKey != null) {
397 return targetMidKey;
398 }
399
400 if (midLeafBlockOffset >= 0) {
401 if (cachingBlockReader == null) {
402 throw new IOException("Have to read the middle leaf block but " +
403 "no block reader available");
404 }
405
406
407 HFileBlock midLeafBlock = cachingBlockReader.readBlock(
408 midLeafBlockOffset, midLeafBlockOnDiskSize, true, true, false, true,
409 BlockType.LEAF_INDEX, null);
410 try {
411 ByteBuff b = midLeafBlock.getBufferWithoutHeader();
412 int numDataBlocks = b.getIntAfterPosition(0);
413 int keyRelOffset = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 1));
414 int keyLen = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 2)) - keyRelOffset;
415 int keyOffset =
416 Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset
417 + SECONDARY_INDEX_ENTRY_OVERHEAD;
418 byte[] bytes = b.toBytes(keyOffset, keyLen);
419 targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length);
420 } finally {
421 cachingBlockReader.returnBlock(midLeafBlock);
422 }
423 } else {
424
425 targetMidKey = blockKeys[rootCount / 2];
426 }
427
428 this.midKey.set(targetMidKey);
429 return targetMidKey;
430 }
431
432 protected void initialize(int numEntries) {
433 blockKeys = new Cell[numEntries];
434 }
435
436
437
438
439
440
441
442
443 protected void add(final byte[] key, final long offset, final int dataSize) {
444 blockOffsets[rootCount] = offset;
445
446 blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length);
447 blockDataSizes[rootCount] = dataSize;
448 rootCount++;
449 }
450
451 @Override
452 public int rootBlockContainingKey(final byte[] key, int offset, int length,
453 CellComparator comp) {
454
455 throw new UnsupportedOperationException("Cannot find for a key containing plain byte " +
456 "array. Only cell based keys can be searched for");
457 }
458
459 @Override
460 public int rootBlockContainingKey(Cell key) {
461
462 int pos = Bytes.binarySearch(blockKeys, key, comparator);
463
464
465
466 if (pos >= 0) {
467
468 assert pos < blockKeys.length;
469 return pos;
470 }
471
472
473
474
475
476
477 int i = -pos - 1;
478 assert 0 <= i && i <= blockKeys.length;
479 return i - 1;
480 }
481
482 @Override
483 public String toString() {
484 StringBuilder sb = new StringBuilder();
485 sb.append("size=" + rootCount).append("\n");
486 for (int i = 0; i < rootCount; i++) {
487 sb.append("key=").append((blockKeys[i]))
488 .append("\n offset=").append(blockOffsets[i])
489 .append(", dataSize=" + blockDataSizes[i]).append("\n");
490 }
491 return sb.toString();
492 }
493 }
494
495
496
497
498
499
500
501
502
503
/**
 * The reader side of the block index: holds the root-level index in memory
 * and provides search over it as well as over on-disk non-root index blocks.
 * Subclasses define the root key representation (byte[] vs {@link Cell}).
 */
static abstract class BlockIndexReader implements HeapSize {

  // Root-level index arrays, parallel to the subclass's key storage:
  // per-entry block offsets and on-disk sizes, and the entry count.
  protected long[] blockOffsets;
  protected int[] blockDataSizes;
  protected int rootCount = 0;

  // Mid-key metadata (multi-level indexes only): location of the middle
  // leaf-level index block and the mid-key entry's index within it.
  // All remain -1 when the metadata is absent (see readMultiLevelIndexRoot).
  protected long midLeafBlockOffset = -1;
  protected int midLeafBlockOnDiskSize = -1;
  protected int midKeyEntry = -1;

  /**
   * The number of levels in the block index tree. One if there is only the
   * root level; two for root plus leaf levels, etc.
   */
  protected int searchTreeLevel;

  // Used to load non-root index blocks and data blocks during traversal.
  protected CachingBlockReader cachingBlockReader;

  /**
   * @return true if the block index is empty.
   */
  public abstract boolean isEmpty();

  /**
   * Verifies that the block index is non-empty and has been loaded.
   *
   * @throws IllegalStateException if the index is empty or not loaded
   */
  public void ensureNonEmpty() {
    if (isEmpty()) {
      throw new IllegalStateException("Block index is empty or not loaded");
    }
  }

  /**
   * Convenience wrapper around {@link #loadDataBlockWithScanInfo} that
   * discards the scan info and returns only the data block.
   *
   * @return the data block containing the key, or null if the key is not
   *         present in this file
   */
  public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks,
      boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
      throws IOException {
    BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock,
        cacheBlocks,
        pread, isCompaction, expectedDataBlockEncoding);
    if (blockWithScanInfo == null) {
      return null;
    } else {
      return blockWithScanInfo.getHFileBlock();
    }
  }

  /**
   * Locates the data block containing the given key together with scan info
   * (the next indexed key), or returns null when the key is not in the file.
   *
   * @param key          the key to search for
   * @param currentBlock the block the caller is currently on, reusable when
   *                     it is the block being sought
   */
  public abstract BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock,
      boolean cacheBlocks,
      boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding)
      throws IOException;

  /**
   * An approximation to this file's middle key. Implementations may need to
   * read a leaf-level index block via the caching block reader.
   */
  public abstract Cell midkey() throws IOException;

  /**
   * @param i zero-based root-level entry index
   * @return the file offset of the i-th root-level block
   */
  public long getRootBlockOffset(int i) {
    return blockOffsets[i];
  }

  /**
   * @param i zero-based root-level entry index
   * @return the on-disk size of the i-th root-level block
   */
  public int getRootBlockDataSize(int i) {
    return blockDataSizes[i];
  }

  /**
   * @return the number of root-level index entries
   */
  public int getRootBlockCount() {
    return rootCount;
  }

  /**
   * Finds the root-level index block that may contain the given byte-array
   * key, using the supplied comparator where applicable.
   */
  public abstract int rootBlockContainingKey(final byte[] key, int offset, int length,
      CellComparator comp);

  /**
   * Same as {@link #rootBlockContainingKey(byte[], int, int, CellComparator)}
   * with a null comparator.
   */
  public int rootBlockContainingKey(final byte[] key, int offset, int length) {
    return rootBlockContainingKey(key, offset, length, null);
  }

  /**
   * Finds the root-level index block that may contain the given cell key.
   */
  public abstract int rootBlockContainingKey(final Cell key);

  /**
   * Extracts the i-th key from a non-root index block. Block layout: entry
   * count (int), numEntries + 1 relative-offset ints (the secondary index),
   * then the entries themselves (offset + on-disk size + key bytes each).
   *
   * @return the raw key bytes, or null when i is out of [0, numEntries)
   */
  protected byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) {
    int numEntries = nonRootIndex.getInt(0);
    if (i < 0 || i >= numEntries) {
      return null;
    }

    // Where the entry section starts: after the count and the secondary
    // index of numEntries + 1 ints.
    int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);
    // Offset of this entry relative to the start of the entry section.
    int targetKeyRelOffset = nonRootIndex.getInt(
        Bytes.SIZEOF_INT * (i + 1));

    // The key starts after the entry's own offset/size fields.
    int targetKeyOffset = entriesOffset
        + targetKeyRelOffset
        + SECONDARY_INDEX_ENTRY_OVERHEAD;

    // Key length = delta between consecutive relative offsets minus the
    // per-entry overhead (offset + size fields).
    int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) -
      targetKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

    return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength);
  }

  /**
   * Binary-searches a non-root index block for the given key.
   *
   * @return the index of the entry with the largest key less than or equal
   *         to the search key, or -1 when the key is before the first entry
   * @throws IllegalStateException if the search invariants are violated
   *         (indicates a corrupt block or a broken comparator)
   */
  static int binarySearchNonRootIndex(Cell key, ByteBuff nonRootIndex,
      CellComparator comparator) {

    int numEntries = nonRootIndex.getIntAfterPosition(0);
    int low = 0;
    int high = numEntries - 1;
    int mid = 0;

    // Start of the entry section: after the count and the secondary index
    // of numEntries + 1 ints.
    int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

    // Reused wrapper so the loop allocates nothing per iteration.
    ByteBufferedKeyOnlyKeyValue nonRootIndexkeyOnlyKV = new ByteBufferedKeyOnlyKeyValue();
    ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<ByteBuffer>();
    while (low <= high) {
      mid = (low + high) >>> 1;

      // Offset of the mid entry relative to the entry section.
      int midKeyRelOffset = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 1));

      // The key starts after the entry's own offset/size fields.
      int midKeyOffset = entriesOffset
          + midKeyRelOffset
          + SECONDARY_INDEX_ENTRY_OVERHEAD;

      // Key length = delta of consecutive relative offsets minus overhead.
      int midLength = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 2)) -
          midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

      // Compare against the mid key without materializing a byte array.
      nonRootIndex.asSubByteBuffer(midKeyOffset, midLength, pair);
      nonRootIndexkeyOnlyKV.setKey(pair.getFirst(), pair.getSecond(), midLength);
      int cmp = comparator.compareKeyIgnoresMvcc(key, nonRootIndexkeyOnlyKV);

      // key lives above the mid entry
      if (cmp > 0)
        low = mid + 1;
      // key lives below the mid entry
      else if (cmp < 0)
        high = mid - 1;
      else
        return mid;
    }

    // No exact match. The loop invariant guarantees low == high + 1 here.
    if (low != high + 1) {
      throw new IllegalStateException("Binary search broken: low=" + low
          + " " + "instead of " + (high + 1));
    }

    // low is the insertion point; the containing entry is the one before it.
    int i = low - 1;

    // Sanity-check the result range: -1 (before first key) to numEntries - 1.
    if (i < -1 || i >= numEntries) {
      throw new IllegalStateException("Binary search broken: result is " +
          i + " but expected to be between -1 and (numEntries - 1) = " +
          (numEntries - 1));
    }

    return i;
  }

  /**
   * Searches a non-root index block for the given key and, on success,
   * positions the buffer at the matching entry so the caller can read its
   * block offset (long) and on-disk size (int) directly.
   *
   * @return the entry index, or -1 when the key is before the first entry
   */
  static int locateNonRootIndexEntry(ByteBuff nonRootBlock, Cell key,
      CellComparator comparator) {
    int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator);

    if (entryIndex != -1) {
      int numEntries = nonRootBlock.getIntAfterPosition(0);

      // Start of the entry section: after the count and the secondary index.
      int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

      // Relative offset of the located entry.
      int entryRelOffset = nonRootBlock
          .getIntAfterPosition(Bytes.SIZEOF_INT * (1 + entryIndex));

      nonRootBlock.position(entriesOffset + entryRelOffset);
    }

    return entryIndex;
  }

  /**
   * Reads the root-level index entries from the given stream. Each entry is
   * a block offset (long), an on-disk size (int) and a vint-length-prefixed
   * key; storage is delegated to the subclass via initialize()/add().
   *
   * @param in         stream positioned at the start of the root index
   * @param numEntries number of root-level entries to read
   */
  public void readRootIndex(DataInput in, final int numEntries) throws IOException {
    blockOffsets = new long[numEntries];
    initialize(numEntries);
    blockDataSizes = new int[numEntries];

    // Entries are appended in file order via add().
    if (numEntries > 0) {
      for (int i = 0; i < numEntries; ++i) {
        long offset = in.readLong();
        int dataSize = in.readInt();
        byte[] key = Bytes.readByteArray(in);
        add(key, offset, dataSize);
      }
    }
  }

  // Allocates subclass-specific key storage for numEntries entries.
  protected abstract void initialize(int numEntries);

  // Appends one root-level entry; called in file order by readRootIndex().
  protected abstract void add(final byte[] key, final long offset, final int dataSize);

  /**
   * Reads the root index from an index block and returns the stream so the
   * caller can continue reading trailing metadata (see
   * {@link #readMultiLevelIndexRoot}).
   */
  public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException {
    DataInputStream in = blk.getByteStream();
    readRootIndex(in, numEntries);
    return in;
  }

  /**
   * Reads the root index block of a multi-level index, including the
   * optional mid-key metadata that follows the entries. When the metadata is
   * absent the mid* fields keep their -1 defaults.
   */
  public void readMultiLevelIndexRoot(HFileBlock blk,
      final int numEntries) throws IOException {
    DataInputStream in = readRootIndex(blk, numEntries);
    // The checksum bytes at the end of the block are not index payload;
    // if what remains (excluding them) is too small, there is no mid-key
    // metadata to read.
    int checkSumBytes = blk.totalChecksumBytes();
    if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) {
      // No mid-key metadata available.
      return;
    }
    midLeafBlockOffset = in.readLong();
    midLeafBlockOnDiskSize = in.readInt();
    midKeyEntry = in.readInt();
  }

  @Override
  public long heapSize() {
    // Object header, the three references (blockOffsets, blockDataSizes,
    // cachingBlockReader) and the two ints (rootCount, searchTreeLevel).
    long heapSize = ClassSize.align(3 * ClassSize.REFERENCE +
        2 * Bytes.SIZEOF_INT + ClassSize.OBJECT);

    // The mid-key metadata fields (long + 2 ints).
    heapSize += MID_KEY_METADATA_SIZE;

    // Subclass-specific key storage.
    heapSize = calculateHeapSizeForBlockKeys(heapSize);

    if (blockOffsets != null) {
      heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length
          * Bytes.SIZEOF_LONG);
    }

    if (blockDataSizes != null) {
      heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length
          * Bytes.SIZEOF_INT);
    }

    return ClassSize.align(heapSize);
  }

  // Adds the heap usage of the subclass's key storage to heapSize.
  protected abstract long calculateHeapSizeForBlockKeys(long heapSize);
}
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939 public static class BlockIndexWriter implements InlineBlockWriter {
940
941
942
943
944
945
946
947
948
949 private BlockIndexChunk rootChunk = new BlockIndexChunk();
950
951
952
953
954
955 private BlockIndexChunk curInlineChunk = new BlockIndexChunk();
956
957
958
959
960
961
962
963
964
965
966 private int numLevels = 1;
967
968 private HFileBlock.Writer blockWriter;
969 private byte[] firstKey = null;
970
971
972
973
974
975
976 private long totalNumEntries;
977
978
979 private long totalBlockOnDiskSize;
980
981
982 private long totalBlockUncompressedSize;
983
984
985 private int maxChunkSize;
986
987
988 private boolean singleLevelOnly;
989
990
991 private CacheConfig cacheConf;
992
993
994 private String nameForCaching;
995
996
997 public BlockIndexWriter() {
998 this(null, null, null);
999 singleLevelOnly = true;
1000 }
1001
1002
1003
1004
1005
1006
1007
1008 public BlockIndexWriter(HFileBlock.Writer blockWriter,
1009 CacheConfig cacheConf, String nameForCaching) {
1010 if ((cacheConf == null) != (nameForCaching == null)) {
1011 throw new IllegalArgumentException("Block cache and file name for " +
1012 "caching must be both specified or both null");
1013 }
1014
1015 this.blockWriter = blockWriter;
1016 this.cacheConf = cacheConf;
1017 this.nameForCaching = nameForCaching;
1018 this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
1019 }
1020
1021 public void setMaxChunkSize(int maxChunkSize) {
1022 if (maxChunkSize <= 0) {
1023 throw new IllegalArgumentException("Invald maximum index block size");
1024 }
1025 this.maxChunkSize = maxChunkSize;
1026 }
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046 public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
1047 if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) {
1048 throw new IOException("Trying to write a multi-level block index, " +
1049 "but are " + curInlineChunk.getNumEntries() + " entries in the " +
1050 "last inline chunk.");
1051 }
1052
1053
1054
1055 byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata()
1056 : null;
1057
1058 if (curInlineChunk != null) {
1059 while (rootChunk.getRootSize() > maxChunkSize) {
1060 rootChunk = writeIntermediateLevel(out, rootChunk);
1061 numLevels += 1;
1062 }
1063 }
1064
1065
1066 long rootLevelIndexPos = out.getPos();
1067
1068 {
1069 DataOutput blockStream =
1070 blockWriter.startWriting(BlockType.ROOT_INDEX);
1071 rootChunk.writeRoot(blockStream);
1072 if (midKeyMetadata != null)
1073 blockStream.write(midKeyMetadata);
1074 blockWriter.writeHeaderAndData(out);
1075 }
1076
1077
1078 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
1079 totalBlockUncompressedSize +=
1080 blockWriter.getUncompressedSizeWithoutHeader();
1081
1082 if (LOG.isTraceEnabled()) {
1083 LOG.trace("Wrote a " + numLevels + "-level index with root level at pos "
1084 + rootLevelIndexPos + ", " + rootChunk.getNumEntries()
1085 + " root-level entries, " + totalNumEntries + " total entries, "
1086 + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) +
1087 " on-disk size, "
1088 + StringUtils.humanReadableInt(totalBlockUncompressedSize) +
1089 " total uncompressed size.");
1090 }
1091 return rootLevelIndexPos;
1092 }
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104 public void writeSingleLevelIndex(DataOutput out, String description)
1105 throws IOException {
1106 expectNumLevels(1);
1107
1108 if (!singleLevelOnly)
1109 throw new IOException("Single-level mode is turned off");
1110
1111 if (rootChunk.getNumEntries() > 0)
1112 throw new IOException("Root-level entries already added in " +
1113 "single-level mode");
1114
1115 rootChunk = curInlineChunk;
1116 curInlineChunk = new BlockIndexChunk();
1117
1118 if (LOG.isTraceEnabled()) {
1119 LOG.trace("Wrote a single-level " + description + " index with "
1120 + rootChunk.getNumEntries() + " entries, " + rootChunk.getRootSize()
1121 + " bytes");
1122 }
1123 rootChunk.writeRoot(out);
1124 }
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138 private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
1139 BlockIndexChunk currentLevel) throws IOException {
1140
1141 BlockIndexChunk parent = new BlockIndexChunk();
1142
1143
1144 BlockIndexChunk curChunk = new BlockIndexChunk();
1145
1146 for (int i = 0; i < currentLevel.getNumEntries(); ++i) {
1147 curChunk.add(currentLevel.getBlockKey(i),
1148 currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i));
1149
1150 if (curChunk.getRootSize() >= maxChunkSize)
1151 writeIntermediateBlock(out, parent, curChunk);
1152 }
1153
1154 if (curChunk.getNumEntries() > 0) {
1155 writeIntermediateBlock(out, parent, curChunk);
1156 }
1157
1158 return parent;
1159 }
1160
1161 private void writeIntermediateBlock(FSDataOutputStream out,
1162 BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException {
1163 long beginOffset = out.getPos();
1164 DataOutputStream dos = blockWriter.startWriting(
1165 BlockType.INTERMEDIATE_INDEX);
1166 curChunk.writeNonRoot(dos);
1167 byte[] curFirstKey = curChunk.getBlockKey(0);
1168 blockWriter.writeHeaderAndData(out);
1169
1170 if (cacheConf != null) {
1171 HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf);
1172 cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching,
1173 beginOffset), blockForCaching);
1174 }
1175
1176
1177 totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
1178 totalBlockUncompressedSize +=
1179 blockWriter.getUncompressedSizeWithoutHeader();
1180
1181
1182
1183
1184
1185
1186 parent.add(curFirstKey, beginOffset,
1187 blockWriter.getOnDiskSizeWithHeader());
1188
1189
1190 curChunk.clear();
1191 curFirstKey = null;
1192 }
1193
1194
1195
1196
1197 public final int getNumRootEntries() {
1198 return rootChunk.getNumEntries();
1199 }
1200
1201
1202
1203
1204 public int getNumLevels() {
1205 return numLevels;
1206 }
1207
1208 private void expectNumLevels(int expectedNumLevels) {
1209 if (numLevels != expectedNumLevels) {
1210 throw new IllegalStateException("Number of block index levels is "
1211 + numLevels + "but is expected to be " + expectedNumLevels);
1212 }
1213 }
1214
1215
1216
1217
1218
1219
1220 @Override
1221 public boolean shouldWriteBlock(boolean closing) {
1222 if (singleLevelOnly) {
1223 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1224 }
1225
1226 if (curInlineChunk == null) {
1227 throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been " +
1228 "called with closing=true and then called again?");
1229 }
1230
1231 if (curInlineChunk.getNumEntries() == 0) {
1232 return false;
1233 }
1234
1235
1236 if (closing) {
1237 if (rootChunk.getNumEntries() == 0) {
1238
1239
1240
1241 expectNumLevels(1);
1242 rootChunk = curInlineChunk;
1243 curInlineChunk = null;
1244 return false;
1245 }
1246
1247 return true;
1248 } else {
1249 return curInlineChunk.getNonRootSize() >= maxChunkSize;
1250 }
1251 }
1252
1253
1254
1255
1256
1257
1258
1259 @Override
1260 public void writeInlineBlock(DataOutput out) throws IOException {
1261 if (singleLevelOnly)
1262 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1263
1264
1265
1266 curInlineChunk.writeNonRoot(out);
1267
1268
1269
1270 firstKey = curInlineChunk.getBlockKey(0);
1271
1272
1273 curInlineChunk.clear();
1274 }
1275
1276
1277
1278
1279
1280 @Override
1281 public void blockWritten(long offset, int onDiskSize, int uncompressedSize) {
1282
1283 totalBlockOnDiskSize += onDiskSize;
1284 totalBlockUncompressedSize += uncompressedSize;
1285
1286 if (singleLevelOnly)
1287 throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
1288
1289 if (firstKey == null) {
1290 throw new IllegalStateException("Trying to add second-level index " +
1291 "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
1292 "but the first key was not set in writeInlineBlock");
1293 }
1294
1295 if (rootChunk.getNumEntries() == 0) {
1296
1297 expectNumLevels(1);
1298 numLevels = 2;
1299 }
1300
1301
1302
1303 rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);
1304 firstKey = null;
1305 }
1306
1307 @Override
1308 public BlockType getInlineBlockType() {
1309 return BlockType.LEAF_INDEX;
1310 }
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322 public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) {
1323 curInlineChunk.add(firstKey, blockOffset, blockDataSize);
1324 ++totalNumEntries;
1325 }
1326
1327
1328
1329
1330 public void ensureSingleLevel() throws IOException {
1331 if (numLevels > 1) {
1332 throw new IOException ("Wrote a " + numLevels + "-level index with " +
1333 rootChunk.getNumEntries() + " root-level entries, but " +
1334 "this is expected to be a single-level block index.");
1335 }
1336 }
1337
1338
1339
1340
1341
1342
1343 @Override
1344 public boolean getCacheOnWrite() {
1345 return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite();
1346 }
1347
1348
1349
1350
1351
1352
1353
1354 public long getTotalUncompressedSize() {
1355 return totalBlockUncompressedSize;
1356 }
1357
1358 }
1359
1360
1361
1362
1363
1364
1365 static class BlockIndexChunk {
1366
1367
1368 private final List<byte[]> blockKeys = new ArrayList<byte[]>();
1369
1370
1371 private final List<Long> blockOffsets = new ArrayList<Long>();
1372
1373
1374 private final List<Integer> onDiskDataSizes = new ArrayList<Integer>();
1375
1376
1377
1378
1379
1380
1381 private final List<Long> numSubEntriesAt = new ArrayList<Long>();
1382
1383
1384
1385
1386
1387
1388 private int curTotalNonRootEntrySize = 0;
1389
1390
1391
1392
1393 private int curTotalRootSize = 0;
1394
1395
1396
1397
1398
1399
1400 private final List<Integer> secondaryIndexOffsetMarks =
1401 new ArrayList<Integer>();
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416 void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
1417 long curTotalNumSubEntries) {
1418
1419 secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
1420 curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
1421 + firstKey.length;
1422
1423 curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
1424 + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;
1425
1426 blockKeys.add(firstKey);
1427 blockOffsets.add(blockOffset);
1428 onDiskDataSizes.add(onDiskDataSize);
1429
1430 if (curTotalNumSubEntries != -1) {
1431 numSubEntriesAt.add(curTotalNumSubEntries);
1432
1433
1434 if (numSubEntriesAt.size() != blockKeys.size()) {
1435 throw new IllegalStateException("Only have key/value count " +
1436 "stats for " + numSubEntriesAt.size() + " block index " +
1437 "entries out of " + blockKeys.size());
1438 }
1439 }
1440 }
1441
1442
1443
1444
1445
1446
1447
1448 public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
1449 add(firstKey, blockOffset, onDiskDataSize, -1);
1450 }
1451
1452 public void clear() {
1453 blockKeys.clear();
1454 blockOffsets.clear();
1455 onDiskDataSizes.clear();
1456 secondaryIndexOffsetMarks.clear();
1457 numSubEntriesAt.clear();
1458 curTotalNonRootEntrySize = 0;
1459 curTotalRootSize = 0;
1460 }
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479 public int getEntryBySubEntry(long k) {
1480
1481
1482
1483 int i = Collections.binarySearch(numSubEntriesAt, k);
1484
1485
1486
1487
1488 if (i >= 0)
1489 return i + 1;
1490
1491
1492 return -i - 1;
1493 }
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503 public byte[] getMidKeyMetadata() throws IOException {
1504 ByteArrayOutputStream baos = new ByteArrayOutputStream(
1505 MID_KEY_METADATA_SIZE);
1506 DataOutputStream baosDos = new DataOutputStream(baos);
1507 long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1);
1508 if (totalNumSubEntries == 0) {
1509 throw new IOException("No leaf-level entries, mid-key unavailable");
1510 }
1511 long midKeySubEntry = (totalNumSubEntries - 1) / 2;
1512 int midKeyEntry = getEntryBySubEntry(midKeySubEntry);
1513
1514 baosDos.writeLong(blockOffsets.get(midKeyEntry));
1515 baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));
1516
1517 long numSubEntriesBefore = midKeyEntry > 0
1518 ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
1519 long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
1520 if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE)
1521 {
1522 throw new IOException("Could not identify mid-key index within the "
1523 + "leaf-level block containing mid-key: out of range ("
1524 + subEntryWithinEntry + ", numSubEntriesBefore="
1525 + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
1526 + ")");
1527 }
1528
1529 baosDos.writeInt((int) subEntryWithinEntry);
1530
1531 if (baosDos.size() != MID_KEY_METADATA_SIZE) {
1532 throw new IOException("Could not write mid-key metadata: size=" +
1533 baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE);
1534 }
1535
1536
1537 baos.close();
1538
1539 return baos.toByteArray();
1540 }
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551 void writeNonRoot(DataOutput out) throws IOException {
1552
1553 out.writeInt(blockKeys.size());
1554
1555 if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
1556 throw new IOException("Corrupted block index chunk writer: " +
1557 blockKeys.size() + " entries but " +
1558 secondaryIndexOffsetMarks.size() + " secondary index items");
1559 }
1560
1561
1562
1563
1564
1565 for (int currentSecondaryIndex : secondaryIndexOffsetMarks)
1566 out.writeInt(currentSecondaryIndex);
1567
1568
1569
1570 out.writeInt(curTotalNonRootEntrySize);
1571
1572 for (int i = 0; i < blockKeys.size(); ++i) {
1573 out.writeLong(blockOffsets.get(i));
1574 out.writeInt(onDiskDataSizes.get(i));
1575 out.write(blockKeys.get(i));
1576 }
1577 }
1578
1579
1580
1581
1582
1583 int getNonRootSize() {
1584 return Bytes.SIZEOF_INT
1585 + Bytes.SIZEOF_INT * (blockKeys.size() + 1)
1586 + curTotalNonRootEntrySize;
1587 }
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599 void writeRoot(DataOutput out) throws IOException {
1600 for (int i = 0; i < blockKeys.size(); ++i) {
1601 out.writeLong(blockOffsets.get(i));
1602 out.writeInt(onDiskDataSizes.get(i));
1603 Bytes.writeByteArray(out, blockKeys.get(i));
1604 }
1605 }
1606
1607
1608
1609
1610 int getRootSize() {
1611 return curTotalRootSize;
1612 }
1613
1614
1615
1616
1617 public int getNumEntries() {
1618 return blockKeys.size();
1619 }
1620
1621 public byte[] getBlockKey(int i) {
1622 return blockKeys.get(i);
1623 }
1624
1625 public long getBlockOffset(int i) {
1626 return blockOffsets.get(i);
1627 }
1628
1629 public int getOnDiskDataSize(int i) {
1630 return onDiskDataSizes.get(i);
1631 }
1632
1633 public long getCumulativeNumKV(int i) {
1634 if (i < 0)
1635 return 0;
1636 return numSubEntriesAt.get(i);
1637 }
1638
1639 }
1640
1641 public static int getMaxChunkSize(Configuration conf) {
1642 return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
1643 }
1644 }