导出数据处理建议
数据格式
处理建议
streamingContext.fileStream[KeyClass, ValueClass, InputFormatClass](dataDirectory)

groupId: com.databricks
artifactId: spark-csv_2.10
version: 1.4.0

import java.io.{BufferedReader, File, FileInputStream, InputStreamReader}
import org.apache.commons.csv.{CSVFormat, CSVParser, QuoteMode}
import scala.collection.JavaConverters._
/**
 * Example: read an exported CSV file with Apache Commons CSV and print each
 * record's fields.
 *
 * The export format uses backslash as the escape character and double quotes
 * for quoting. A "," is appended after every field — including the last one —
 * to keep the printed output identical to the original sample.
 */
object Test {
  def main(args: Array[String]): Unit = {
    val file = new File("xxx") // TODO: replace with the real export file path
    val reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)))
    try {
      // Must match the export job's settings: backslash-escaped, double-quoted fields.
      val csvFileFormat = CSVFormat.DEFAULT.withEscape('\\').withQuote('"')
      val csvParser = new CSVParser(reader, csvFileFormat)
      try {
        for (record <- csvParser.getRecords.asScala) {
          // Each field followed by a trailing comma; an empty record prints an empty line.
          println((0 until record.size()).map(record.get(_) + ",").mkString)
        }
      } finally {
        csvParser.close()
      }
    } finally {
      // Close the underlying reader even if parsing fails (fixes resource leak).
      reader.close()
    }
  }
}导入到数据仓库示例
-- External Hive table over the exported CSV files in /tmp/test_export.
-- OpenCSVSerde parses the backslash-escaped, double-quoted fields written
-- by the export job; the first line of each file is a header and is skipped.
CREATE EXTERNAL TABLE TEST_EXPORT
(
sessionId STRING,
time BIGINT,
sendTime BIGINT,
pageTime BIGINT,
domain STRING,
page STRING,
queryParameters STRING,
eventName STRING,
eventNumber DOUBLE,
eventVariable map<string, string>,
loginUserId STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE
location '/tmp/test_export'
tblproperties ("skip.header.line.count"="1", "quote.delim"="\"", "escape.delim"="\\")
md5进行文件完整性校验

Last updated
Was this helpful?