dex-oat-vdex文件解析实现

前言

通过阅读8.0源码下dex解析的过程，自己实现相关文件解析器。

dex

系统的dex解析头文件
https://github.com/imbaya2466/art_read/blob/master/art/runtime/dex_file.h

文件格式

可以参考前文
https://xn--74q78i15hxv3arigm4e.cn/2018/04/08/dex%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F/

官方:
https://source.android.com/devices/tech/dalvik/dex-format

细节方面直接参考官方即可，非常全面

header  文件头
ids*5   索引区
class_defs  类定义区
data        数据区----一般直接到文件末尾了
  link_data   连接数据区--位于数据区

// Raw header_item: the fixed-layout header of a dex file.
// The *_size_ fields of the id and class_def tables are entry counts; data_size_ is a byte count.
struct Header {
  uint8_t magic_[8];   // Magic number.
  uint32_t checksum_;  // Checksum of the dex file.
  uint8_t signature_[kSha1DigestSize]; // SHA-1 signature of the dex file.
  uint32_t file_size_;  // Size of the entire file.
  uint32_t header_size_;  // Size of the header.
  uint32_t endian_tag_;   // Endianness tag; standard .dex files are little-endian, so this is normally the constant 0x12345678.
  uint32_t link_size_;  // unused. link_size_ and link_off_ give the size and offset of the link data; not used.
  uint32_t link_off_;  // unused
  uint32_t map_off_;  // unused. Offset of the map item, which belongs to the data section.
  uint32_t string_ids_size_;  // Number of entries and offset of the table of all strings used by the dex.
  uint32_t string_ids_off_;  
  uint32_t type_ids_size_;  // Number of entries and offset of the type table; the original note gives the count as < 65535.
  uint32_t type_ids_off_;  
  uint32_t proto_ids_size_;  // Number of entries and offset of the method prototype table; the original note gives the count as < 65535.
  uint32_t proto_ids_off_;  
  uint32_t field_ids_size_;  // Number of entries and offset of the field table.
  uint32_t field_ids_off_;  
  uint32_t method_ids_size_;  // Number of entries and offset of the method table.
  uint32_t method_ids_off_;  
  uint32_t class_defs_size_;  // Number of entries and offset of the class definition table.
  uint32_t class_defs_off_;  
  uint32_t data_size_;  // Size in bytes and offset of the data section.
  uint32_t data_off_;  
};

各ids和class_defs的size表示项数；data的size表示字节数（此外file_size_、header_size_、link_size_也是字节数），data的size+off应该为文件末尾。ids都是索引，def是定义

索引区

string_ids

每项为一个结构体，描述偏移用的，位于索引区

// Raw string_id_item: one entry of the string_ids index table.
struct StringId {
  uint32_t string_data_off_;  // Offset of the string data; points into the data section.
};

每个偏移指向一个数据结构体

在data区的string数据结构体

struct string_data_item
    {
	    uleb128 utf16_size;//uleb128编码长度
      byte data;
    }

数据使用MUTF-8编码。在string_data_item中，头部存放的是由uleb128编码的字符个数（以UTF-16代码单元计），用来描述整个字串的长度。
leb128格式，是基于 1 个 Byte 的一种不定长度的编码方式。若某个 Byte 的最高位为 1 ，则表示还需要下一个 Byte 来描述，直至最后一个 Byte 的最高位为 0。即一个leb128编码的数值所占的字节数，是从起始字节到第一个最高位为0的字节为止。这里描述的是单个数值的编码。
将leb128编码的数字转换为可读数字的规则是：除去每个字节的最高位，将每个字节剩余的7个bits拼接在一起（先出现的字节为低位），即为数字。dex中一个数值占1-5个字节。

type_ids

这个数据结构中存放的数据主要是描述dex中所有的类型，比如类类型，基本类型，数组类型的名字

每项为一个结构体，描述字串索引，位于索引区

// Raw type_id_item: one entry of the type_ids index table.
struct TypeId {
  dex::StringIndex descriptor_idx_;  // Index into string_ids of the type's descriptor string.
};

4字节描述类型的字串标号

字串表示类型还是使用:V-void Z-boolean B-byte S-short C-char D-double F-float I-int J-long 类类型:LA/B/V; 数组：[类型 表示 类型[]

proto_ids

proto 的意思是 method prototype 代表 java 语言里的一个 method 的原型

// Raw proto_id_item: one method prototype (shorty string, return type, parameter list).
struct ProtoId {
  dex::StringIndex shorty_idx_;     // Index into string_ids; short-form description of the prototype, e.g. DLL. 4 bytes.
  dex::TypeIndex return_type_idx_;  // Index into type_ids describing the return type. 2 bytes.
  uint16_t pad_;                    // padding = 0; used for alignment. 2 bytes.
  uint32_t parameters_off_;         // File offset of the type_list holding the parameter types.
};

简单表示原型字串索引、返回类型类型索引、参数类型表偏移，偏移为0时没有参数

参数类型表，位于data区

// Raw type_list: a counted list of type indexes, located in the data section.
class TypeList {
  uint32_t size_;  // Number of entries in the list.
  TypeItem list_[1];  // Each entry is a two-byte index into type_ids.
};

field_ids

描述类属性成员

// Raw field_id_item: identifies one field by owning class, field type and name.
struct FieldId {
  dex::TypeIndex class_idx_;   // Index into type_ids_; the class the field belongs to.
  dex::TypeIndex type_idx_;    // Index into type_ids_; the type of the field itself.
  dex::StringIndex name_idx_;  // Index into string_ids_; the name of the field.
};

method_ids

描述类的方法

// Raw method_id_item: identifies one method by owning class, prototype and name.
struct MethodId {
  dex::TypeIndex class_idx_;   // Index into type_ids_; the class the method belongs to.
  uint16_t proto_idx_;         // Index into proto_ids_; the prototype of the method.
  dex::StringIndex name_idx_;  // Index into string_ids_; the name of the method.
};

类定义区

存放class的定义

// Raw class_def_item: the definition of one class.
// The file offsets below generally point into the data section.
struct ClassDef {
  dex::TypeIndex class_idx_;  // Index into type_ids_; which class this definition is for.
  uint16_t pad1_;  // padding = 0
  uint32_t access_flags_;   // Access flags of the class: public, final, static, etc.
  dex::TypeIndex superclass_idx_;  // Index into type_ids_; the type of the superclass.
  uint16_t pad2_;  // padding = 0
  uint32_t interfaces_off_;  // File offset of a type_list of the interfaces the class implements (interfaces are types too).
  dex::StringIndex source_file_idx_;  // Index into string_ids_ for the source file info; 0xffffffff when absent.
  uint32_t annotations_off_;  // File offset of the annotations_directory_item holding the annotations; 0 when absent.
  uint32_t class_data_off_;  // File offset of the class_data_item holding the class data; 0 when absent.
  uint32_t static_values_off_;  // File offset of an EncodedArray with the initial values of static fields; 0 when absent.
};

这里文件的偏移一般都指向了data区

annotations_off

// Layout of the structure that ClassDef::annotations_off_ points to.
// Pseudo-structure: the trailing arrays are sized by the count fields above them.
struct DexAnnotationsDirectoryItem
{
    uint class_annotations_off;        // Offset of an annotation_set_item.
    uint fields_size;                  // Number of entries in field_annotations.
    uint annotated_methods_size;       // Number of entries in method_annotations.
    uint annotated_parameters_size;    // Number of entries in parameter_annotations.
    
    field_annotation field_annotations[fields_size];
    method_annotation method_annotations[annotated_methods_size];
    parameter_annotation parameter_annotations[annotated_parameters_size];
}

class_data_off

		// class_data_item: the structure that ClassDef::class_data_off_ points to.
		// Pseudo-structure: four ULEB128 counts followed by four inline arrays.
		struct class_data_item
		{
			uleb128 static_fields_size;   // Number of static fields.
			uleb128 instance_fields_size; // Number of instance fields.
			uleb128 direct_methods_size;  // Number of direct methods.
			uleb128 virtual_methods_size; // Number of virtual methods.
			// The members below are inline arrays, not offsets; their lengths are the counts above.
			encoded_field static_fields [ static_fields_size ];      // Static fields.
			encoded_field instance_fields [ instance_fields_size ];  // Instance fields.
			encoded_method direct_methods [ direct_methods_size ];   // Direct methods.
			encoded_method virtual_methods [ virtual_methods_size ]; // Virtual methods.
		}
// encoded_field: one field entry inside a class_data_item (pseudo-structure, ULEB128 members).
struct encoded_field
{
    uleb128 field_idx_diff; // Index into field_ids, stored as the difference from the previous element of this array.
    uleb128 access_flags;   // Access flags.
}
// encoded_method: one method entry inside a class_data_item (pseudo-structure, ULEB128 members).
struct encoded_method
{
    uleb128 method_idx_diff; // Index into method_ids, stored as the difference from the previous element of this array.
    uleb128 access_flags;     // Access flags.
    uleb128 code_off;       // Offset into the data section, pointing at the code_item.
}

直接方法是指类的（type为某个类）所有static方法、private方法和构造器（实例构造器<init>以及类初始化器<clinit>）。反之，既非static、也非private、也不是构造器的方法（如protected或者public的实例方法）都叫做虚方法。

struct code_item 
{
    ushort                         registers_size;  //此代码使用的寄存器数量
    ushort                         ins_size;        //此代码所用方法的传入参数的字数
    ushort                         outs_size;       //此代码进行方法调用所需的传出参数空间的字数
    ushort                         tries_size;      //此实例的 try_item 数量
    uint                         debug_info_off;    //从文件开头到此代码的调试信息（行号 + 局部变量信息）序列的偏移量
    uint                         insns_size;        //指令列表的大小
    ushort                         insns [insns_size]; //字节码的实际数组
    ushort                         paddding;             //填充用
    try_item                     tries [tyies_size]; // 用于表示在代码中捕获异常的位置以及如何对异常进行处理的数组
    encoded_catch_handler_list  handlers;             // 用于表示“捕获类型列表和关联处理程序地址”的列表的字节。每个 try_item 都具有到此结构的分组偏移量

系统加载

最后都是使用 DexFile::OpenCommon(共同) 来解析加载到内存的dex
其一开始直接使用DexFile构造dex对象

// Constructor of the DexFile class.
// Parameters, per the original note: start address of the data, its size, the file name,
// the checksum from the dex header, and null (for oat_dex_file).
DexFile::DexFile(const uint8_t* base,
                 size_t size,
                 const std::string& location,
                 uint32_t location_checksum,
                 const OatDexFile* oat_dex_file)
    : begin_(base),
      size_(size),
      location_(location),
      location_checksum_(location_checksum),
      // The start of the mapped data is reinterpreted as the dex Header.
      header_(reinterpret_cast<const Header*>(base)),
      // Each table pointer below is base plus the matching *_off_ field read through header_.
      string_ids_(reinterpret_cast<const StringId*>(base + header_->string_ids_off_)),
      type_ids_(reinterpret_cast<const TypeId*>(base + header_->type_ids_off_)),
      field_ids_(reinterpret_cast<const FieldId*>(base + header_->field_ids_off_)),
      method_ids_(reinterpret_cast<const MethodId*>(base + header_->method_ids_off_)),
      proto_ids_(reinterpret_cast<const ProtoId*>(base + header_->proto_ids_off_)),
      class_defs_(reinterpret_cast<const ClassDef*>(base + header_->class_defs_off_)),
      method_handles_(nullptr),
      num_method_handles_(0),
      call_site_ids_(nullptr),
      num_call_site_ids_(0),
      oat_dex_file_(oat_dex_file) {
...
// Initialize the section info from the map list (begin_ + header_->map_off_).
  InitializeSectionsFromMapList();
}

实现解析器

详细细节参考官方
https://source.android.com/devices/tech/dalvik/dex-format#top_of_page

项目仓库:
https://github.com/imbaya2466/XFile

杂记:

dalvik的字节码是以2字节为基础的变长码，见https://source.android.com/devices/tech/dalvik/instruction-formats.html
try-catch块的实现就是直接指定try的区域、catch的列表。相关的数据结构:try_item encoded_catch_handler_list
java的注解提供数据解释程序代码，定义注解只能有属性。使用时在目标位置运用注解即可为目标添加数据。该数据解析可在编译阶段或是运行阶段。编译阶段可使用javac的接口，运行阶段可使用反射。在dex中直接保存注解信息，相关数据结构:annotations_directory_item
类的static字段初始值是直接写在dex类信息中的。相关:static_values_off
一些有趣的java代码翻译为字节码的点(init clinit)，可以阅读阿里技术的深入探索android热修复2.2节

odex

其中包含 APK 中已经过 AOT 编译的方法代码。

本质是一个oat文件，在系统中使用dlopen加载。也可以作为elf文件映射加载。
见下oat文件格式分析

vdex

其中包含 APK 的未压缩 DEX 代码，另外还有一些旨在加快验证速度的元数据。

系统加载

https://github.com/imbaya2466/art_read/blob/master/art/runtime/vdex_file.h
更详细的:
https://github.com/anestisb/vdexExtractor/tree/master/src/vdex

art\runtime\vdex_file.h

// Excerpt from art\runtime\vdex_file.h: the constructor stores the given memory map,
// and the lines after "header:" list the fields of the vdex header.
explicit VdexFile(MemMap* mmap) : mmap_(mmap) {}
header:
  uint8_t magic_[4];                // Magic number "vdex".
  uint8_t version_[4];              // Version.
  uint32_t number_of_dex_files_;    // Number of dex files.
  uint32_t dex_size_;               // The original note says "file size". NOTE(review): presumably the size of the embedded dex data, not of the whole vdex file; confirm against vdex_file.h.
  uint32_t verifier_deps_size_;
  uint32_t quickening_info_size_;

就是dex的集合

oat

可参考:https://bbs.pediy.com/thread-206230.htm
https://www.jianshu.com/p/e0929379cdc3
老罗:
https://blog.csdn.net/Luoshengyang/article/details/39307813

elf中导出了三个符号oatdata、oatexec和oatlastword，标明oat的位置，oat中又含dex
oat头指明dex文件的偏移、oat化的class信息
oat化的class信息指明了native代码位置