Skip to content

Commit

Permalink
Summarizer as separate class
Browse files Browse the repository at this point in the history
  • Loading branch information
IlyaGusev committed Sep 13, 2020
1 parent 2d0159f commit e3af2e1
Show file tree
Hide file tree
Showing 14 changed files with 93 additions and 85 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ set(SOURCE_FILES
src/rank.cpp
src/run_server.cpp
src/server_clustering.cpp
src/summarizer.cpp
src/thread_pool.cpp
src/util.cpp
)
Expand Down
2 changes: 0 additions & 2 deletions configs/clusterer.pbtxt
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,3 @@ clusterings: [
}
]
iter_timestamp_percentile: 0.99
hosts_rating: "models/pagerank_rating.txt"
alexa_rating: "models/alexa_rating_4_fixed.txt"
2 changes: 2 additions & 0 deletions configs/summarizer.pbtxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
hosts_rating: "models/pagerank_rating.txt"
alexa_rating: "models/alexa_rating_4_fixed.txt"
16 changes: 3 additions & 13 deletions src/annotator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@
#include "util.h"

#include <boost/algorithm/string/join.hpp>
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <fcntl.h>
#include <optional>
#include <tinyxml2/tinyxml2.h>

#include <optional>

static std::unique_ptr<TEmbedder> LoadEmbedder(tg::TEmbedderConfig config) {
if (config.type() == tg::ET_FASTTEXT) {
return std::make_unique<TFastTextEmbedder>(config);
Expand All @@ -38,7 +36,7 @@ TAnnotator::TAnnotator(
, SaveNotNews(saveNotNews)
, Mode(mode)
{
ParseConfig(configPath);
::ParseConfig(configPath, Config);
SaveTexts = Config.save_texts() || (Mode == "json");
ComputeNasty = Config.compute_nasty();

Expand Down Expand Up @@ -224,11 +222,3 @@ std::string TAnnotator::PreprocessText(const std::string& text) const {
Tokenizer.tokenize(text, tokens);
return boost::join(tokens, " ");
}

void TAnnotator::ParseConfig(const std::string& fname) {
const int fileDesc = open(fname.c_str(), O_RDONLY);
ENSURE(fileDesc >= 0, "Could not open config file");
google::protobuf::io::FileInputStream fileInput(fileDesc);
const bool success = google::protobuf::TextFormat::Parse(&fileInput, &Config);
ENSURE(success, "Invalid prototxt file");
}
2 changes: 0 additions & 2 deletions src/annotator.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ class TAnnotator {

std::string PreprocessText(const std::string& text) const;

void ParseConfig(const std::string& fname);

private:
tg::TAnnotatorConfig Config;

Expand Down
34 changes: 2 additions & 32 deletions src/clusterer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
#include "clustering/slink.h"
#include "util.h"

#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <fcntl.h>
#include <iostream>


Expand All @@ -22,22 +19,11 @@ uint64_t GetIterTimestamp(const std::vector<TDbDocument>& documents, double perc
return documents[index].FetchTime;
}

TClusterer::TClusterer(const std::string& configPath, bool isSummarizing)
: IsSummarizing(isSummarizing)
{
ParseConfig(configPath);
TClusterer::TClusterer(const std::string& configPath) {
::ParseConfig(configPath, Config);
for (const tg::TClusteringConfig& config: Config.clusterings()) {
Clusterings[config.language()] = std::make_unique<TSlinkClustering>(config);
}
// Load agency ratings
LOG_DEBUG("Loading agency ratings...");
AgencyRating.Load(Config.hosts_rating());
LOG_DEBUG("Agency ratings loaded");

// Load alexa agency ratings
LOG_DEBUG("Loading alexa agency ratings...");
AlexaAgencyRating.Load(Config.alexa_rating());
LOG_DEBUG("Alexa agency ratings loaded");
}

TClusterIndex TClusterer::Cluster(std::vector<TDbDocument>&& docs) const {
Expand Down Expand Up @@ -69,14 +55,6 @@ TClusterIndex TClusterer::Cluster(std::vector<TDbDocument>&& docs) const {

for (const auto& [language, clustering] : Clusterings) {
TClusters langClusters = clustering->Cluster(lang2Docs[language]);
if (IsSummarizing) {
for (TNewsCluster& cluster: langClusters) {
assert(cluster.GetSize() > 0);
cluster.Summarize(AgencyRating);
cluster.CalcImportance(AlexaAgencyRating);
cluster.CalcCategory();
}
}
std::stable_sort(
langClusters.begin(),
langClusters.end(),
Expand All @@ -89,11 +67,3 @@ TClusterIndex TClusterer::Cluster(std::vector<TDbDocument>&& docs) const {
return clusterIndex;
}


void TClusterer::ParseConfig(const std::string& fname) {
const int fileDesc = open(fname.c_str(), O_RDONLY);
ENSURE(fileDesc >= 0, "Could not open config file");
google::protobuf::io::FileInputStream fileInput(fileDesc);
const bool success = google::protobuf::TextFormat::Parse(&fileInput, &Config);
ENSURE(success, "Invalid prototxt file");
}
6 changes: 1 addition & 5 deletions src/clusterer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,15 @@ struct TClusterIndex {

class TClusterer {
public:
TClusterer(const std::string& configPath, bool isSummarizing);
TClusterer(const std::string& configPath);

TClusterIndex Cluster(std::vector<TDbDocument>&& docs) const;

private:
void Summarize(TClusters& clusters) const;
void CalcWeights(TClusters& clusters) const;
void ParseConfig(const std::string& fname);

private:
tg::TClustererConfig Config;
std::unordered_map<tg::ELanguage, std::unique_ptr<TClustering>> Clusterings;
TAgencyRating AgencyRating;
TAlexaAgencyRating AlexaAgencyRating;
bool IsSummarizing;
};
31 changes: 14 additions & 17 deletions src/clustering/slink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,39 +179,36 @@ std::vector<size_t> TSlinkClustering::ClusterBatch(
nn[i] = minJ;
}

// Cluster meta
std::vector<size_t> clusterSizes(docSize);
std::vector<TClusterSiteNames> clusterSiteNames(docSize);

auto it = begin;
for (size_t i = 0; i < docSize; i++) {
clusterSizes[i] = 1;

if (Config.ban_same_hosts()) {
clusterSiteNames[i].insert((begin + i)->SiteName);
clusterSiteNames[i].insert(it->SiteName);
++it;
}
}
assert(!Config.ban_same_hosts() || it == end);

// Main linking loop
for (size_t level = 0; level + 1 < docSize; ++level) {
// Calculate minimal distance
auto minDistanceIt = std::min_element(nnDistances.begin(), nnDistances.end());
size_t minI = std::distance(nnDistances.begin(), minDistanceIt);
size_t minJ = nn[minI];
float minDistance = *minDistanceIt;

const size_t firstClusterSize = clusterSizes[minI];
const size_t secondClusterSize = clusterSizes[minJ];

TClusterSiteNames* firstClusterSiteNames = Config.ban_same_hosts() ? &clusterSiteNames[minI] : nullptr;
TClusterSiteNames* secondClusterSiteNames = Config.ban_same_hosts() ? &clusterSiteNames[minJ] : nullptr;

const size_t minI = std::distance(nnDistances.begin(), minDistanceIt);
const size_t minJ = nn[minI];
const float minDistance = *minDistanceIt;
if (minDistance > Config.small_threshold()) {
break;
}

const size_t firstClusterSize = clusterSizes[minI];
const size_t secondClusterSize = clusterSizes[minJ];
const size_t newClusterSize = firstClusterSize + secondClusterSize;
if (!IsNewClusterSizeAcceptable(newClusterSize, minDistance, Config)
|| (Config.ban_same_hosts() && HasSameSource(*firstClusterSiteNames, *secondClusterSiteNames))
) {
const bool isAcceptableSize = IsNewClusterSizeAcceptable(newClusterSize, minDistance, Config);
const bool hasSameSource = Config.ban_same_hosts() && HasSameSource(clusterSiteNames[minI], clusterSiteNames[minJ]);
if (!isAcceptableSize || hasSameSource) {
nnDistances[minI] = INF_DISTANCE;
nnDistances[minJ] = INF_DISTANCE;
continue;
Expand All @@ -232,7 +229,7 @@ std::vector<size_t> TSlinkClustering::ClusterBatch(
clusterSizes[minI] = newClusterSize;
clusterSizes[minJ] = newClusterSize;
if (Config.ban_same_hosts()) {
firstClusterSiteNames->insert(secondClusterSiteNames->begin(), secondClusterSiteNames->end());
clusterSiteNames[minI].insert(clusterSiteNames[minJ].begin(), clusterSiteNames[minJ].end());
}

// Update distance matrix and nearest neighbors
Expand Down
8 changes: 7 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "clusterer.h"
#include "rank.h"
#include "run_server.h"
#include "summarizer.h"
#include "timer.h"
#include "util.h"

Expand All @@ -20,6 +21,7 @@ int main(int argc, char** argv) {
("server_config", po::value<std::string>()->default_value("configs/server.pbtxt"), "server_config")
("annotator_config", po::value<std::string>()->default_value("configs/annotator.pbtxt"), "annotator_config")
("clusterer_config", po::value<std::string>()->default_value("configs/clusterer.pbtxt"), "clusterer_config")
("summarizer_config", po::value<std::string>()->default_value("configs/summarizer.pbtxt"), "summarizer_config")
("ndocs", po::value<int>()->default_value(-1), "ndocs")
("save_not_news", po::bool_switch()->default_value(false), "save_not_news")
("languages", po::value<std::vector<std::string>>()->multitoken()->default_value(std::vector<std::string>{"ru", "en"}, "ru en"), "languages")
Expand Down Expand Up @@ -178,7 +180,7 @@ int main(int argc, char** argv) {

// Clustering
const std::string clustererConfigPath = vm["clusterer_config"].as<std::string>();
TClusterer clusterer(clustererConfigPath, mode != "threads");
TClusterer clusterer(clustererConfigPath);
TTimer<std::chrono::high_resolution_clock, std::chrono::milliseconds> clusteringTimer;
TClusterIndex clusterIndex = clusterer.Cluster(std::move(docs));
LOG_DEBUG("Clustering: " << clusteringTimer.Elapsed() << " ms")
Expand Down Expand Up @@ -216,11 +218,15 @@ int main(int argc, char** argv) {
// Ranking
uint64_t window = vm["window_size"].as<uint64_t>();
bool printTopDebugInfo = vm["print_top_debug_info"].as<bool>();
const std::string summarizerConfigPath = vm["summarizer_config"].as<std::string>();
const TSummarizer summarizer(summarizerConfigPath);

TClusters allClusters;
for (const auto& language: {tg::LN_EN, tg::LN_RU}) {
if (clusterIndex.Clusters.find(language) == clusterIndex.Clusters.end()) {
continue;
}
summarizer.Summarize(clusterIndex.Clusters.at(language));
std::copy(
clusterIndex.Clusters.at(language).cbegin(),
clusterIndex.Clusters.at(language).cend(),
Expand Down
4 changes: 3 additions & 1 deletion src/proto/config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ message TClusteringConfig {
// Clusterer settings: one TClusteringConfig per language plus the percentile
// used to derive the iteration timestamp from document fetch times.
message TClustererConfig {
    repeated TClusteringConfig clusterings = 1;
    float iter_timestamp_percentile = 2;
}

// Summarizer settings: paths to the agency rating files
// (see configs/summarizer.pbtxt).
message TSummarizerConfig {
    // NOTE(review): field numbers 3/4 were carried over from TClustererConfig,
    // where these fields used to live. A brand-new message could number them
    // 1/2 — text format binds by field name, so existing .pbtxt configs would
    // still parse — but confirm no binary-serialized configs exist first.
    string hosts_rating = 3;
    string alexa_rating = 4;
}

14 changes: 2 additions & 12 deletions src/run_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,8 @@
#include "server_clustering.h"
#include "util.h"

#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <rocksdb/db.h>

#include <fcntl.h>
#include <iostream>
#include <sys/resource.h>

Expand All @@ -21,15 +18,8 @@ using namespace drogon;
namespace {

tg::TServerConfig ParseConfig(const std::string& fname) {
const int fileDesc = open(fname.c_str(), O_RDONLY);
ENSURE(fileDesc >= 0, "Could not open config file");

google::protobuf::io::FileInputStream fileInput(fileDesc);

tg::TServerConfig config;
const bool succes = google::protobuf::TextFormat::Parse(&fileInput, &config);
ENSURE(succes, "Invalid prototxt file");

::ParseConfig(fname, config);
return config;
}

Expand Down Expand Up @@ -97,7 +87,7 @@ int RunServer(const std::string& fname, uint16_t port) {
std::unique_ptr<TAnnotator> annotator = std::make_unique<TAnnotator>(config.annotator_config_path(), languages);

LOG_DEBUG("Creating clusterer");
std::unique_ptr<TClusterer> clusterer = std::make_unique<TClusterer>(config.clusterer_config_path(), true);
std::unique_ptr<TClusterer> clusterer = std::make_unique<TClusterer>(config.clusterer_config_path());

TServerClustering serverClustering(std::move(clusterer), db.get());

Expand Down
29 changes: 29 additions & 0 deletions src/summarizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "summarizer.h"

#include "util.h"

// Builds a summarizer from a tg::TSummarizerConfig prototxt file and eagerly
// loads both agency ratings from the paths named in that config.
TSummarizer::TSummarizer(const std::string& configPath) {
    ::ParseConfig(configPath, Config);

    // Host-based (pagerank) rating, used by cluster summarization.
    LOG_DEBUG("Loading agency ratings...");
    AgencyRating.Load(Config.hosts_rating());
    LOG_DEBUG("Agency ratings loaded");

    // Alexa rating, used for cluster importance.
    LOG_DEBUG("Loading alexa agency ratings...");
    AlexaAgencyRating.Load(Config.alexa_rating());
    LOG_DEBUG("Alexa agency ratings loaded");
}

// Post-processes every cluster in-place: summarization against the host
// rating, importance against the Alexa rating, then category computation.
void TSummarizer::Summarize(TClusters& clusters) const {
    for (auto& newsCluster : clusters) {
        // Empty clusters are a caller bug, not a recoverable condition.
        assert(newsCluster.GetSize() > 0);
        newsCluster.Summarize(AgencyRating);
        newsCluster.CalcImportance(AlexaAgencyRating);
        newsCluster.CalcCategory();
    }
}


17 changes: 17 additions & 0 deletions src/summarizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include "agency_rating.h"
#include "cluster.h"
#include "config.pb.h"

#include <string>

// Applies per-cluster post-processing (summarization, importance, category)
// using agency ratings loaded once at construction time.
class TSummarizer {
public:
    // configPath: path to a tg::TSummarizerConfig prototxt file.
    // `explicit` prevents accidental implicit conversion from a path string.
    explicit TSummarizer(const std::string& configPath);

    // Mutates every cluster in `clusters` in-place.
    void Summarize(TClusters& clusters) const;

private:
    tg::TSummarizerConfig Config;
    TAgencyRating AgencyRating;
    TAlexaAgencyRating AlexaAgencyRating;
};
12 changes: 12 additions & 0 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

#include "enum.pb.h"

#include <fcntl.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <nlohmann_json/json.hpp>

#include <string>
Expand Down Expand Up @@ -75,3 +78,12 @@ double Sigmoid(double x);

// ISO 8601 with timezone date to timestamp
uint64_t DateToTimestamp(const std::string& date);

// Parses a text-format (prototxt) protobuf file into `config`.
// Fails via ENSURE if the file cannot be opened or does not parse.
template <class TConfig>
void ParseConfig(const std::string& fname, TConfig& config) {
    const int fileDesc = open(fname.c_str(), O_RDONLY);
    ENSURE(fileDesc >= 0, "Could not open config file");
    google::protobuf::io::FileInputStream fileInput(fileDesc);
    // FileInputStream does NOT take ownership of the descriptor by default,
    // so without this the fd leaks on every config load (including when the
    // parse ENSURE below fails and unwinds).
    fileInput.SetCloseOnDelete(true);
    const bool success = google::protobuf::TextFormat::Parse(&fileInput, &config);
    ENSURE(success, "Invalid prototxt file");
}

0 comments on commit e3af2e1

Please sign in to comment.