27 | 27 | import com.google.common.io.Closeables; |
28 | 28 | import java.io.Closeable; |
29 | 29 | import java.io.IOException; |
30 | | -import java.util.HashMap; |
31 | 30 | import java.util.List; |
32 | | -import java.util.Map; |
33 | 31 | import org.apache.avro.Schema; |
| 32 | +import org.apache.avro.SchemaParseException; |
34 | 33 | import org.apache.parquet.cli.BaseCommand; |
35 | 34 | import org.apache.parquet.cli.util.Expressions; |
| 35 | +import org.apache.parquet.example.data.Group; |
| 36 | +import org.apache.parquet.hadoop.ParquetReader; |
| 37 | +import org.apache.parquet.hadoop.example.GroupReadSupport; |
36 | 38 | import org.slf4j.Logger; |
37 | 39 |
38 | 40 | @Parameters(commandDescription = "Print the first N records from a file") |
@@ -60,41 +62,90 @@ public CatCommand(Logger console, long defaultNumRecords) { |
60 | 62 | public int run() throws IOException { |
61 | 63 | Preconditions.checkArgument(sourceFiles != null && !sourceFiles.isEmpty(), "Missing file name"); |
62 | 64 |
63 | | - // Ensure all source files have the columns specified first |
64 | | - Map<String, Schema> schemas = new HashMap<>(); |
65 | 65 | for (String source : sourceFiles) { |
66 | | - Schema schema = getAvroSchema(source); |
67 | | - schemas.put(source, Expressions.filterSchema(schema, columns)); |
| 66 | + try { |
| 67 | + runWithAvroSchema(source); |
| 68 | + } catch (SchemaParseException e) { |
| 69 | + console.debug( |
| 70 | + "Avro schema conversion failed for {}, falling back to Group reader: {}", |
| 71 | + source, |
| 72 | + e.getMessage()); |
| 73 | + runWithGroupReader(source); |
| 74 | + } |
68 | 75 | } |
69 | 76 |
70 | | - for (String source : sourceFiles) { |
71 | | - Schema projection = schemas.get(source); |
72 | | - Iterable<Object> reader = openDataFile(source, projection); |
73 | | - boolean threw = true; |
74 | | - long count = 0; |
75 | | - try { |
76 | | - for (Object record : reader) { |
77 | | - if (numRecords > 0 && count >= numRecords) { |
78 | | - break; |
79 | | - } |
80 | | - if (columns == null || columns.size() != 1) { |
81 | | - console.info(String.valueOf(record)); |
82 | | - } else { |
83 | | - console.info(String.valueOf(select(projection, record, columns.get(0)))); |
84 | | - } |
85 | | - count += 1; |
| 77 | + return 0; |
| 78 | + } |
| 79 | + |
| 80 | + private void runWithAvroSchema(String source) throws IOException { |
| 81 | + Schema schema = getAvroSchema(source); |
| 82 | + Schema projection = Expressions.filterSchema(schema, columns); |
| 83 | + |
| 84 | + Iterable<Object> reader = openDataFile(source, projection); |
| 85 | + boolean threw = true; |
| 86 | + long count = 0; |
| 87 | + try { |
| 88 | + for (Object record : reader) { |
| 89 | + if (numRecords > 0 && count >= numRecords) { |
| 90 | + break; |
86 | 91 | } |
87 | | - threw = false; |
88 | | - } catch (RuntimeException e) { |
89 | | - throw new RuntimeException("Failed on record " + count + " in file " + source, e); |
90 | | - } finally { |
91 | | - if (reader instanceof Closeable) { |
92 | | - Closeables.close((Closeable) reader, threw); |
| 92 | + if (columns == null || columns.size() != 1) { |
| 93 | + console.info(String.valueOf(record)); |
| 94 | + } else { |
| 95 | + console.info(String.valueOf(select(projection, record, columns.get(0)))); |
93 | 96 | } |
| 97 | + count += 1; |
| 98 | + } |
| 99 | + threw = false; |
| 100 | + } catch (RuntimeException e) { |
| 101 | + throw new RuntimeException("Failed on record " + count + " in file " + source, e); |
| 102 | + } finally { |
| 103 | + if (reader instanceof Closeable) { |
| 104 | + Closeables.close((Closeable) reader, threw); |
94 | 105 | } |
95 | 106 | } |
| 107 | + } |
96 | 108 |
97 | | - return 0; |
| 109 | + private void runWithGroupReader(String source) throws IOException { |
| 110 | + ParquetReader<Group> reader = ParquetReader.<Group>builder(new GroupReadSupport(), qualifiedPath(source)) |
| 111 | + .withConf(getConf()) |
| 112 | + .build(); |
| 113 | + |
| 114 | + boolean threw = true; |
| 115 | + long count = 0; |
| 116 | + try { |
| 117 | + for (Group record = reader.read(); record != null; record = reader.read()) { |
| 118 | + if (numRecords > 0 && count >= numRecords) { |
| 119 | + break; |
| 120 | + } |
| 121 | + |
| 122 | +        if (columns == null || columns.isEmpty()) { |
| 123 | + console.info(record.toString()); |
| 124 | + } else { |
| 125 | + StringBuilder sb = new StringBuilder(); |
| 126 | + for (int i = 0; i < columns.size(); i++) { |
| 127 | + String columnName = columns.get(i); |
| 128 | + try { |
| 129 | +              String value = |
| 130 | + record.getValueToString(record.getType().getFieldIndex(columnName), 0); |
| 131 | +                if (sb.length() > 0) sb.append(", "); |
| 132 | + sb.append(columnName).append(": ").append(value); |
| 133 | +              } catch (RuntimeException e) { |
| 134 | +                console.warn("Column '{}' not found in file {}: {}", columnName, source, e.getMessage()); |
| 135 | + } |
| 136 | + } |
| 137 | + if (sb.length() > 0) { |
| 138 | + console.info(sb.toString()); |
| 139 | + } |
| 140 | + } |
| 141 | + count += 1; |
| 142 | + } |
| 143 | + threw = false; |
| 144 | + } catch (RuntimeException e) { |
| 145 | + throw new RuntimeException("Failed on record " + count + " in file " + source, e); |
| 146 | + } finally { |
| 147 | + Closeables.close(reader, threw); |
| 148 | + } |
98 | 149 | } |
99 | 150 |
100 | 151 | @Override |
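
For context, the fallback added here reads rows through Parquet's example Group API instead of converting the file schema to Avro first, which is the step that can throw a SchemaParseException when, for example, a Parquet field name is not a valid Avro name. A minimal standalone sketch of that read pattern (the input path and class name are hypothetical, not part of this change):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class GroupReadSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical input; any Parquet file readable by the example Group API,
    // including files whose field names are not valid Avro names.
    Path path = new Path("data.parquet");

    // ParquetReader implements Closeable, so try-with-resources handles cleanup.
    try (ParquetReader<Group> reader = ParquetReader.<Group>builder(new GroupReadSupport(), path)
        .withConf(new Configuration())
        .build()) {
      // read() returns null once the file is exhausted.
      for (Group record = reader.read(); record != null; record = reader.read()) {
        System.out.println(record); // SimpleGroup.toString() prints one field per line
      }
    }
  }
}

The Avro path remains the default because it applies column projection through Expressions.filterSchema before any rows are read; the Group path materializes whole rows and filters only at print time.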