A summary with code samples of the set operations that come with STL: union, intersection, symmetrical difference and set difference.
For consistency, the two sets of integer vectors used in each example are the same and are:
vals1 = { 1, 2, 4 } vals2 = { 1, 2, 5, 7 };
Unions
The union of two sets is formed by the elements that are present in either one of the sets, or in both.
#include <iostream> #include <algorithm> #include <iterator> #include <string> #include <vector> typedef std::vector<int>::iterator iter; template<typename T, typename InputIterator> void Print(std::ostream& ostr, InputIterator itbegin, InputIterator itend, const std::string& delimiter) { std::copy(itbegin, itend, std::ostream_iterator<T>(ostr, delimiter.c_str())); } int main() { // Initialise sample data sets and vectors int vals1[] = { 1, 2, 4 }; int vals2[] = { 1, 2, 5, 7 }; int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] ); int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] ); std::vector<int> v1( vals1, vals1 + size1 ); std::vector<int> v2( vals2, vals2 + size2 ); std::vector<int> v( size1 + size2 ); std::vector<int>::iterator it_union; std::vector<int>::iterator it; // Sort the vectors (essential) std::sort( v1.begin(), v1.end() ); std::sort( v2.begin(), v2.end() ); // Find the union of the two vector sets: elements present in either // of the sets or both it_union = set_union ( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin()); int union_size = int( it_union - v.begin() ); std::cout << "Union size = " << union_size << std::endl; // Print the union Print<int, iter>( std::cout, v.begin(), it_union, " " ); return 0; }
Output:
Union size = 5
1 2 4 5 7
Intersections
Intersections consist of elements present in both sets at the same time.
#include <iostream> #include <algorithm> #include <iterator> #include <string> #include <vector> typedef std::vector<int>::iterator iter; template<typename T, typename InputIterator> void Print(std::ostream& ostr, InputIterator itbegin, InputIterator itend, const std::string& delimiter) { std::copy(itbegin, itend, std::ostream_iterator<T>(ostr, delimiter.c_str())); } int main() { // Initialise sample data sets and vectors int vals1[] = { 1, 2, 4 }; int vals2[] = { 1, 2, 5, 7 }; int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] ); int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] ); std::vector<int> v1( vals1, vals1 + size1 ); std::vector<int> v2( vals2, vals2 + size2 ); std::vector<int> v( size1 + size2 ); std::vector<int>::iterator it_intersect; std::vector<int>::iterator it; // Sort the vectors (essential) std::sort( v1.begin(), v1.end() ); std::sort( v2.begin(), v2.end() ); // Find the intersection of the two vector sets: elements present in both sets it_intersect = set_intersection( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin()); int intersect_size = int( it_intersect - v.begin() ); std::cout << "Intersection size = " << intersect_size << std::endl; // Print the intersection Print<int, iter>( std::cout, v.begin(), it_intersect, " " ); return 0; }
Output:
Intersection size = 2
1 2
Symmetric Differences
The set of elements that are present in one of the sets, but not in the other.
#include <iostream> #include <algorithm> #include <iterator> #include <string> #include <vector> typedef std::vector<int>::iterator iter; template<typename T, typename InputIterator> void Print(std::ostream& ostr, InputIterator itbegin, InputIterator itend, const std::string& delimiter) { std::copy(itbegin, itend, std::ostream_iterator<T>(ostr, delimiter.c_str())); } int main() { // Initialise sample data sets and vectors int vals1[] = { 1, 2, 4 }; int vals2[] = { 1, 2, 5, 7 }; int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] ); int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] ); std::vector<int> v1( vals1, vals1 + size1 ); std::vector<int> v2( vals2, vals2 + size2 ); std::vector<int> v( size1 + size2 ); std::vector<int>::iterator it_symm_diff; std::vector<int>::iterator it; // Sort the vectors (essential) std::sort( v1.begin(), v1.end() ); std::sort( v2.begin(), v2.end() ); // The symmetric difference of two sets is formed by the elements that are present in one // of the sets, but not in the other. it_symm_diff = set_symmetric_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin()); int symm_diff_size = int( it_symm_diff - v.begin() ); std::cout << "Symmetric difference size = " << symm_diff_size << std::endl; // Print the symmetric difference Print<int, iter>( std::cout, v.begin(), it_symm_diff, " " ); return 0; }
Output:
Symmetric difference size = 3
4 5 7
Set Differences
The elements that present in the first set, but not in the second one.
#include <iostream> #include <algorithm> #include <iterator> #include <string> #include <vector> typedef std::vector<int>::iterator iter; template<typename T, typename InputIterator> void Print(std::ostream& ostr, InputIterator itbegin, InputIterator itend, const std::string& delimiter) { std::copy(itbegin, itend, std::ostream_iterator<T>(ostr, delimiter.c_str())); } int main() { // Initialise sample data sets and vectors int vals1[] = { 1, 2, 4 }; int vals2[] = { 1, 2, 5, 7 }; int size1 = sizeof( vals1 ) / sizeof ( vals1[ 0 ] ); int size2 = sizeof( vals2 ) / sizeof ( vals2[ 0 ] ); std::vector<int> v1( vals1, vals1 + size1 ); std::vector<int> v2( vals2, vals2 + size2 ); std::vector<int> v( size1 + size2 ); std::vector<int>::iterator it_set_diff; std::vector<int>::iterator it; // Sort the vectors (essential) std::sort( v1.begin(), v1.end() ); std::sort( v2.begin(), v2.end() ); // The set difference of two sets is formed by the elements that are present in the first // sets, but not in the second. it_set_diff = set_difference( v1.begin(), v1.end(), v2.begin(), v2.end(), v.begin()); int set_diff_size = int( it_set_diff - v.begin() ); std::cout << "Set difference size = " << set_diff_size << std::endl; // Print the set difference Print<int, iter>( std::cout, v.begin(), it_set_diff, " " ); return 0; }
Output:
Set difference size = 1
4
Practical example: using union / set intersection to calculate the Jaccard Index
The Jaccard index is a means of measuring the similarity and/or diversity of sample sets:
#include <iostream> #include <vector> #include <algorithm> #include <iterator> #include <string> class Token { private: std::string str; public: Token() {} Token( std::string val ) : str( val ) {} std::string Word() const { return str; } }; struct Ascending { bool operator() ( Token& start, Token& end ) { return start.Word() < end.Word(); } }; int main() { std::string str1[] = { "The", "crazy", "cat", "sat", "on", "the", "mat" }; std::string str2[] = { "The", "cat", "sat", "on", "the", "red", "mat" }; std::vector<Token>::iterator tok_intersect, tok_union; int size1 = sizeof( str1 ) / sizeof( str1[ 0 ] ); int size2 = sizeof( str2 ) / sizeof( str2[ 0 ] ); std::vector<Token> tokens1( str1, str1 + size1 ); std::vector<Token> tokens2( str2, str2 + size2 ); std::vector<Token> tokens( size1 + size2 ); std::sort( tokens1.begin(), tokens1.end(), Ascending() ); std::sort( tokens2.begin(), tokens2.end(), Ascending() ); tok_intersect = std::set_intersection( tokens1.begin(), tokens1.end(), tokens2.begin(), tokens2.end(), tokens.begin(), Ascending() ); int intersect_size = int( tok_intersect - tokens.begin() ); tok_union = std::set_union ( tokens1.begin(), tokens1.end(), tokens2.begin(), tokens2.end(), tokens.begin(), Ascending() ); int union_size = int( tok_union - tokens.begin() ); double JaccardIndex = (double) intersect_size / (double) union_size; return 0; }
In this example, the size of the union set is 8, since there are 8 words in either one of the sets, or in both: “The”, “crazy”, “cat”, “sat”, “on”, “the”, “red”, “mat”. The intersection size is 6 since there are 6 words present in both sets at the same time: “The”, “cat”, “sat”, “on”, “the”, “mat”. Thus the Jaccard Index is calculated as 6 / 8 = 0.75.