C++编写算法（八）——散列表

2019-05-08 本文已影响0人俊仔系滑翔机

散列表是数组的拓展，能够处理更加复杂的类型的键，需要使用算术运算操作将键转化为数组的索引来访问数组中的键值对。
散列的查找算法分为两步。第一步是用散列函数将被查找的键转化为数组的一个索引。理想情况下，不同的键都能转化为不同的索引值。但实际情况中，存在相同索引对应多个键值对的情况。第二步就是一个处理碰撞的过程。

1、散列表的结构

散列表主要由若干条链表组成，将这些链表的首地址存储起来形成数组。这样的数组-链表组合便是散列表。散列表能够解决搜索数组元素过慢的问题，通过将键值映射为数组索引(通过哈希函数)，快速找到元素所在链表，再通过链表找到数据。将链表个数与链表内的元素个数做一个权衡，搜索的时间将大大减少。同时，采用的链表个数越多，数组所需分配的空间便越大。因此，散列表算法是在时间和空间上做出权衡的经典例子。
散列表将键值映射为索引的算法是哈希算法。大牛们已经给出了一些常见的对象类型哈希算法，最常见的算法是除留余数法。例如，将整型键映射为数组索引可以将整型键除以链表个数M，所得的余数即为数组索引值。同理，字符型键先将字符转换为相应的ASCII码，然后再进行上述操作。其他的对象类型也有其专门的哈希函数，这里不做展开。
散列表需要在时间和空间中进行权衡。链表个数M过小，将表示有大量的键值所求得的索引存在冲突，这将导致链表的长度过大，搜索时间增加。链表个数M过大时，虽然能够大大降低搜索时间，但是存储空间上升，因为需要更多的空间存储键值。如何合理选取M，使得N个元素均匀地分配在每一个链表中，是散列表的目标。

总的来说，要为一个数据类型实现一个优秀的散列方法需要满足三个条件：
①一致性
②高效性
③均匀性

2、解决碰撞的方法

散列表经常碰到的问题是数组索引的碰撞。解决碰撞的方法为拉链法和线性探测法。

a、基于拉链法的散列表

拉链法就是使用原始的链表数据类型来构建散列表。为了简化链表的结构，这里采用了单链表的结构，降低数据结构的复杂度。

#ifndef JUNZALGHEADER_H_
#define JUNZALGHEADER_H_
#include <vector>
// simulate a <char, int> pair hash function
int hash_function(char k, int M);

struct Node
{
    char key;
    int value;
    Node * next;

    Node()
    {
        key = '\0';
        value = -1;
        next = NULL;
    }
};

typedef Node* Item;

class Sequential
{
private:
    enum {MAX = 10};
    Item begin;
    int length;
public:
    Sequential();
    ~Sequential();
    int sfind(char key);
    bool sinsert(Item no);
    bool isempty();
    bool isfull();
    bool sdelete(char key);
    void SeqShow() const;
};

class Scatter_list
{
private:
    int M;        // number of sequentials
    Sequential capacity[100];   // capacity of Sequential
public:
    Scatter_list();
    ~Scatter_list();
    void setM(int m) { M = m; }
    bool slinsert(Item no);
    bool sldelete(char k);
    int slfind(char k);
    int operator[](char k);
    void scShow() const;
};

#endif //JUNZALGHEADER_H_

头文件中声明了单链表数据结构(Sequential)和散列表数据结构(Scatter_list)。链表最大长度为10，散列表的最大长度为100。
对其中的功能进行实现

#include "JunzAlgHeader.h"
#include <cctype>
#include <iostream>
using std::cout;
using std::endl;


int hash_function(char k, int M)
{
    int hash_num = int(k) % M;
    return hash_num;
}

#pragma region Sequential
// constructor
Sequential::Sequential()
{
    length = 0;
    begin = NULL;
}

// deconstructor
Sequential::~Sequential()
{
}

// empty judgement
bool Sequential::isempty()
{
    return length == 0 ? true : false;
}

// full judgement
bool Sequential::isfull()
{
    return length == MAX ? true : false;
}

// find method
int Sequential::sfind(char k)
{
    if (begin == NULL)
        return -1;
    Item currentNode = begin;
    while (currentNode)
    {
        if (currentNode->key == k)
            break;
        else
            currentNode = currentNode->next;
        if (!currentNode)
            sinsert(currentNode);
    }
    return currentNode->value;
}

// delete method
bool Sequential::sdelete(char k)
{
    if (begin == NULL)
        return false;
    Item currentNode = begin;
    Item formerNode = NULL;
    if (currentNode->key == k)
    {
        begin = currentNode->next;
        return true;
    }
    else
    {
        formerNode = currentNode;
        currentNode = currentNode->next;
    }
    while (currentNode)
    {
        if (currentNode->key == k)
        {
            formerNode->next = currentNode->next;
            break;
        }
        else
        {
            formerNode = currentNode;
            currentNode = currentNode->next;
        }
        if (!currentNode)
            break;
    }
    if (currentNode)
        return false;
    else
        return true;
}


// insert method
bool Sequential::sinsert(Item no)
{
    if (begin == NULL)
    {
        begin = no;
        length = length + 1;
        return true;
    }
        
    else
    {
        if (no->key < begin->key)
        {
            no->next = begin;
            begin = no;
            length = length + 1;
        }
        else
        {
            Item currentNode = begin->next;
            Item former = begin;
            while (currentNode)
            {
                if (no->key > currentNode->key)
                {
                    former = currentNode;
                    currentNode = currentNode->next;
                    continue;
                }
                else if (no->key < currentNode->key)
                    break;
                else
                    currentNode->value = no->value;
            }

            no->next = currentNode;
            former->next = no;
            length = length + 1;
        }
        return true;
    }
}

// show method 
void Sequential::SeqShow() const
{
    Item currentNode = begin;
    while (currentNode)
    {
        cout << "(" <<currentNode->key << ": " << currentNode->value << ")";
        currentNode = currentNode->next;
    }
}
#pragma endregion

#pragma region Scatter_list
// constructor
Scatter_list::Scatter_list()
{
    M = 0;
}

// deconstructor
Scatter_list::~Scatter_list()
{
}

// insert method
bool Scatter_list::slinsert(Item no)
{
    int hash = hash_function(no->key, M);
    return capacity[hash].sinsert(no);
}

// find method
int Scatter_list::slfind(char k)
{
    int hash = hash_function(k, M);
    return capacity[hash].sfind(k);
}

// delete method
bool Scatter_list::sldelete(char k)
{
    int hash = hash_function(k, M);
    return capacity[hash].sdelete(k);
}

// overloading []
int Scatter_list::operator[](char k)
{
    return slfind(k);
}

// show method
void Scatter_list::scShow() const
{
    for (int i = 0; i < M; i++)
    {
        cout << "Seq" << i + 1 << ":";
        capacity[i].SeqShow();
        cout << endl;
    }
}
#pragma endregion

验证散列表的各个功能：查找、插入、删除：

#include<iostream>
#include "JunzAlgHeader.h"
using namespace std;
const int NUM = 5;
int main()
{
    Node no[NUM];
    Scatter_list sc;
    sc.setM(3);
    for (int i = 0; i < NUM; i++)
    {
        cout << "insert a key: ";
        cin >> no[i].key;
        cout << "insert a value:";
        cin >> no[i].value;
        sc.slinsert(&no[i]);
    }
    cout << "\'s\' value: ";
    cout << sc['s'] << endl;
    cout << "The scatter list: " << endl;
    sc.scShow();
    cout << endl;
    cout << "after deleting \'a\'" << endl;
    sc.sldelete('a');
    sc.scShow();
    cout << endl;
    system("pause");
    return 0;
}

得到的结果：

散列表结果.png

b、基于线性探测法的散列表

线性探测法表示的散列表不同于拉链法，其存储空间即是能够存储M个元素的数组。数组的索引对应每个键值所求得的哈希值。线性探测法的散列表的插入分为三种情况：
a、数组在相应哈希值下的存储空间为空，即该位置下没有元素，则直接插入相应的键值对。
b、数组在相应哈希值下的存储空间不为空，即该位置下已经存在元素，元素键值不等，则将哈希值自增1，寻找空余的位置，直至找到存储空间为空的位置，插入元素。
c、数组在相应哈希值下的存储空间不为空，即该位置下已经存在元素，元素键值相等，则直接更改元素值即可。
这种搜索容易溢出，因此，当搜索到数组末尾时，则继续从数组头开始搜索，即首尾元素相连。
基于线性探测法的散列表实现如下：

#ifndef JUNZALGHEADER_H_
#define JUNZALGHEADER_H_
#include <vector>
// simulate a <char, int> pair hash function
int hash_function(char k, int M);

struct Node
{
    char key;
    int value;
    Node * next;

    Node()
    {
        key = '\0';
        value = -1;
        next = NULL;
    }
};

typedef Node* Item;

class Scatter_list
{
private:
    int M;
    enum {MAX = 10};
    Item capacity[MAX];
public:
    Scatter_list();
    ~Scatter_list();
    void setM(int n);
    bool slinsert(Item no); //wrong insert !!!
    int slfind(char k);     // wrong find !!!
    bool sldelete(char k);  // wrong delete !!!
    void show() const;
};
#endif //JUNZALGHEADER_H_

#include "JunzAlgHeader.h"
#include <cctype>
#include <iostream>
using std::cout;
using std::endl;


int hash_function(char k, int M)
{
    int hash_num = int(k) % M;
    return hash_num;
}


#pragma region Scatter_list
Scatter_list::Scatter_list()
{
    for (int i = 0; i < MAX; i++)
        capacity[i] = NULL;
}

Scatter_list::~Scatter_list()
{
}

// set M function
void Scatter_list::setM(int n)
{
    M = n;
}

// insert function
bool Scatter_list::slinsert(Item no)
{
    int hash = hash_function(no->key, M);
    while (capacity[hash])
    {
        hash++;
        if (hash >= MAX)
            hash = 0;
    }
    capacity[hash] = no;
    return true;
}

// find function
int Scatter_list::slfind(char k)
{
    int hash = hash_function(k, M);
    while (capacity[hash])
    {
        if (capacity[hash]->key == k)
            break;
        else
        {
            hash++;
            if (hash >= MAX)
                hash = 0;
        }
    }
    if (capacity[hash] == NULL)
        return -1;
    else
        return capacity[hash]->value;
}

// delete function
bool Scatter_list::sldelete(char k)
{
    int tmp_hash;
    int hash = hash_function(k, M);
    std::vector<Item> temp;
    while (capacity[hash])
    {
        if (capacity[hash]->key == k)
            break;
        else
        {
            hash++;
            if (hash >= MAX)
                hash = 0;
        }
    }
    if (!capacity[hash])
        return false;
    else
    {
        tmp_hash = hash + 1;
        for (tmp_hash; capacity[tmp_hash] != NULL; tmp_hash++)
            temp.push_back(capacity[tmp_hash]);
        for (int i = hash; capacity[i] != NULL; i++)
            capacity[i] = NULL;
        for (int j = 0; j < temp.size(); j++)
            slinsert(temp[j]);
        return true;
    }

}

// show function
void Scatter_list::show() const
{
    for (int i = 0; i < MAX; i++)
    {
        if (!capacity[i])
            cout << " (NULL, NULL)";
        else
            cout << " (" << capacity[i]->key << ", " << capacity[i]->value << ")";
    }
}
#pragma endregion

#include<iostream>
#include "JunzAlgHeader.h"
using namespace std;
const int NUM = 5;
int main()
{
    Node no[NUM];
    Scatter_list sc;
    sc.setM(3);
    for (int i = 0; i < NUM; i++)
    {
        cout << "insert a key: ";
        cin >> no[i].key;
        cout << "insert a value:";
        cin >> no[i].value;
        sc.slinsert(&no[i]);
    }
    sc.show();
    cout << "\'a\': " << sc.slfind('a') << endl;
    sc.sldelete('s');
    sc.show();
    cout << endl;
    system("pause");
    return 0;
}

实现结果为：

线性探测法表示散列表(含空存储位置).png

删除操作会比预想的要复杂一些，因为元素时按照键簇存在的，遇到空指针时，便会结束搜索。因此，如果简单地将某元素置为NULL的操作视为删除，那么该元素后的元素将丢失。因此，线性探测法的删除需要将删除元素后的一些元素重新插入存储空间当中。此时元素的位置将会有所调整。
还可以使用调整存储数组大小的方式，保证散列表的使用率永远不会超过1/2。

3、C++中的散列表

C++中有专门的无序关联容器(unordered_set、unordered_multiset、unordered_map和unordered_multimap)使用了键和哈希表，以便能够快速存取数据。键值对可以用pair容器进行存储。

#include <iostream>
#include <unordered_map>

using namespace std;
const int NUM = 5;
typedef pair<char, int> Pair;

int main()
{
    Pair p1[NUM];
    unordered_map<char, int> a;
    for (int i = 0; i < NUM; i++)
    {
        cout << "Input the key: ";
        cin >> p1[i].first;
        cout << "Input the value: ";
        cin >> p1[i].second;
        a.insert(p1[i]);
    }

    for (auto x : a)
        cout << " (" << x.first << ", " << x.second << ") ";
    cout << endl;
    auto it = a.find('s');
    if (it != a.end())
        cout << " (" << it->first << ", " << it->second << ") ";
    system("pause");
    return 0;
}

上述自己写的实现的散列表的基本功能和无序关联容器unordered_map实现是一样的。但采用STL标准库内的容器，有更加强大的功能。