基本使用 · Hadoop2.x

hadoop虽然安装在Linux上，但是在Windows上写代码时也需要配置hadoop环境。

[TOC]

# 1. windows环境搭建

1. 官网下载与Linux中一致的hadoop安装包
https://archive.apache.org/dist/hadoop/common/hadoop-2.6.0/
![](https://img.kancloud.cn/4d/05/4d05267dafd00df04cc8e9a1502c7463_1040x256.png)
Windows和Linux使用的是同一个.tar.gz文件。

2. 将安装包解压到D盘或其他盘符下
![](https://img.kancloud.cn/89/75/897519331713ae9c760ff3ad33299f98_1145x38.png)

3. 添加 hadoop.dll 和 winutils.exe 到解压目录的 bin 目录下（例如 D:\hadoop-2.6.0-cdh5.14.2\bin）。这两个文件不包含在官方安装包中，需要自行下载与所用 Hadoop 版本匹配的版本。

4. 添加hadoop到Windows的环境变量中：新建 HADOOP_HOME 指向解压目录，并把 %HADOOP_HOME%\bin 加入 Path
![](https://img.kancloud.cn/5b/1f/5b1f39fcad70c06ff8644b324ee5fa52_841x219.png)
![](https://img.kancloud.cn/ad/48/ad486f29d72d09fc30df5c365044ba1b_1219x347.png)

<br/>

# 2. Java API

1. 使用IDEA创建一个Maven工程
![](https://img.kancloud.cn/ce/73/ce73b4a55efbe6cd9efbec22fd7e964d_1055x464.png)

2. 添加依赖

*`pom.xml`*
```xml
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.apache.logging.log4j</groupId>
        <artifactId>log4j-core</artifactId>
        <version>2.8.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0</version>
    </dependency>
</dependencies>
```

*`resources/log4j.properties`*
```properties
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
```

3. Java程序

*`com/exa/hdfs001/HdfsClient.java`*
```java
package com.exa.hdfs001;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * JUnit-driven examples of basic HDFS operations through the Hadoop {@code FileSystem} API.
 *
 * <p>Every example connects to the NameNode at {@code hdfs://hadoop101:9000} as user
 * {@code root} and closes the file system in a {@code finally} block, so a failing operation
 * does not leave the connection open.
 */
public class HdfsClient {

    /** Address of the HDFS NameNode used by every example. */
    private static final String HDFS_URI = "hdfs://hadoop101:9000";

    /** User name the client presents to HDFS. */
    private static final String HDFS_USER = "root";

    /**
     * Opens the HDFS file system at {@link #HDFS_URI} as {@link #HDFS_USER}.
     *
     * @param configuration client-side configuration to connect with
     * @return the file system handle; the caller is responsible for closing it
     * @throws IOException          if the file system cannot be obtained
     * @throws URISyntaxException   if {@link #HDFS_URI} is not a valid URI
     * @throws InterruptedException if the calling thread is interrupted while connecting
     */
    private static FileSystem openFileSystem(Configuration configuration)
            throws IOException, URISyntaxException, InterruptedException {
        return FileSystem.get(new URI(HDFS_URI), configuration, HDFS_USER);
    }

    /**
     * Creates a directory on HDFS, including any missing parents.
     *
     * <p>An already existing directory is not an error and is left untouched
     * (it is not overwritten).
     */
    @Test
    public void hdfsMkdir() throws IOException, URISyntaxException, InterruptedException {
        // 1. Obtain the file system.
        // An HDFS client always acts as some user. By default the client API takes that
        // identity from a JVM argument such as -DHADOOP_USER_NAME=root; here the user name
        // is passed explicitly instead.
        // Alternative: set "fs.defaultFS" to "hdfs://hadoop101:9000" on the Configuration
        // and call FileSystem.get(configuration).
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // 2. Create the directory and report the outcome instead of discarding it.
            boolean created = fs.mkdirs(new Path("/user/hadoop/input002"));
            System.out.println("创建目录结果：" + created);
        } finally {
            // 3. Release the connection even if the operation failed.
            fs.close();
        }
    }

    /**
     * Uploads a local file to HDFS; an existing destination file is overwritten.
     */
    @Test
    public void copyFromLocalFile() throws URISyntaxException, IOException, InterruptedException {
        // 1. Obtain the file system.
        // The replication factor can be set in several places. From highest to lowest
        // priority: a value set in code, e.g. configuration.set("dfs.replication", "2"),
        // then an hdfs-site.xml on the classpath (e.g. src/main/resources),
        // then the default value.
        Configuration configuration = new Configuration();
        FileSystem fs = openFileSystem(configuration);
        try {
            // 2. Upload the file.
            fs.copyFromLocalFile(new Path("d:/hello.txt"), new Path("/user/hadoop/input002/hello.txt"));
        } finally {
            // 3. Release the connection even if the upload failed.
            fs.close();
        }
    }

    /**
     * Downloads an HDFS file to the local disk; an existing local file is overwritten.
     */
    @Test
    public void copyToLocalFile() throws URISyntaxException, IOException, InterruptedException {
        // 1. Obtain the file system.
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem)
            //   delSrc - false keeps the source file on HDFS, true deletes it after the copy
            //   src    - the HDFS file to download
            //   dst    - the local destination
            //   useRawLocalFileSystem - true writes through RawLocalFileSystem, which does no
            //            checksumming and creates no local .crc file; false uses the
            //            checksummed local file system, which writes a .crc file next to
            //            the downloaded file
            fs.copyToLocalFile(false, new Path("/user/hadoop/input002/hello.txt"), new Path("d:/hello.txt"), true);
        } finally {
            fs.close();
        }
    }

    /**
     * Renames a file on HDFS.
     */
    @Test
    public void hdfsRename() throws URISyntaxException, IOException, InterruptedException {
        // Obtain the file system.
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // rename() signals failure (for example a missing source) through its return
            // value rather than an exception, so print it.
            boolean renamed = fs.rename(new Path("/user/hadoop/input002/hello.txt"),
                    new Path("/user/hadoop/input002/hello002.txt"));
            System.out.println("重命名结果：" + renamed);
        } finally {
            fs.close();
        }
    }

    /**
     * Recursively lists every file under {@code /} and prints its name, length, permission,
     * group and the hosts that store each of its blocks.
     */
    @Test
    public void hdfsListFiles() throws URISyntaxException, IOException, InterruptedException {
        // Obtain the file system.
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // The second argument (true) makes the listing recursive.
            RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/"), true);
            while (files.hasNext()) {
                LocatedFileStatus status = files.next();
                System.out.println("文件名：" + status.getPath().getName());
                System.out.println("长度：" + status.getLen());
                System.out.println("权限：" + status.getPermission());
                System.out.println("所属组：" + status.getGroup());

                // Print the hosts that hold each block of the file.
                for (BlockLocation blockLocation : status.getBlockLocations()) {
                    for (String host : blockLocation.getHosts()) {
                        System.out.println("host：" + host);
                    }
                }
                System.out.println("----------------------------------");
            }
        } finally {
            fs.close();
        }
    }

    /**
     * Prints whether each direct child of {@code /} is a file ({@code f:}) or a
     * directory ({@code d:}).
     */
    @Test
    public void hdfsListStatus() throws URISyntaxException, IOException, InterruptedException {
        // Obtain the file system.
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // listStatus() returns only the direct children of "/"; it does not recurse.
            for (FileStatus fileStatus : fs.listStatus(new Path("/"))) {
                String prefix = fileStatus.isFile() ? "f:" : "d:";
                System.out.println(prefix + fileStatus.getPath().getName());
            }
        } finally {
            fs.close();
        }
    }

    /**
     * Deletes a file or directory on HDFS; directories are removed recursively.
     */
    @Test
    public void hdfsDelete() throws URISyntaxException, IOException, InterruptedException {
        // Obtain the file system.
        FileSystem fs = openFileSystem(new Configuration());
        try {
            // The second argument (true) deletes directory contents recursively.
            boolean deleted = fs.delete(new Path("/user/hadoop/input002"), true);
            System.out.println("删除结果：" + deleted);
        } finally {
            fs.close();
        }
    }
}
```

上面提到的用于设置副本数的 hdfs-site.xml（放在项目的 resources 目录下，即类路径中）内容如下：
```xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>
```