A new data structure is about to take dlhist's place. dlhist is currently implemented as a mixture: it is meant to be a "process cache" that records which RSS items have been processed (that is why the URL is used as a unique identifier), but right now it only stores a URL if it has been downloaded. A new data structure, the real "download history", shall be implemented; it will keep track of which title has been downloaded and where it has been downloaded to. This will make it possible to download an RSS title to a given location only once. Splitting this data structure into two separate structures is trivial, as a "process cache" will treat URLs as the unique identifier and a "download history" will treat the title of an RSS item as the unique identifier (and also track its destinations). This commit does not change any functionality; I just rename things to keep the "dlhist" prefix and source files free for when the real dlhist is implemented.
130 lines
2.3 KiB
C
130 lines
2.3 KiB
C
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include <sys/types.h>
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include "env.h"
|
|
#include "error.h"
|
|
#include "cconf.h"
|
|
#include "proc-cache.h"
|
|
#include "filter.h"
|
|
#include "http.h"
|
|
#include "rss.h"
|
|
|
|
/*
 * Value handed to proc_cache_purge() by process() before any target is
 * handled. NOTE(review): presumably the age after which cache entries are
 * dropped -- confirm against the proc-cache implementation.
 */
#define PROC_CACHE_PURGE_INTERVAL (60*60*6) /* 6 hours (in seconds) */
|
|
|
|
static int write_http_file(struct http_file *file, const char *dest) {
|
|
|
|
char path[4096];
|
|
int rc, fd;
|
|
|
|
snprintf(path, sizeof(path), "%s/%s",
|
|
dest, file->filename);
|
|
|
|
fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0664);
|
|
if (fd < 0 && errno != EEXIST) {
|
|
error("failed to write file: %s", path);
|
|
return -1;
|
|
}
|
|
|
|
rc = write(fd, file->data.block, file->data.len);
|
|
close(fd);
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
 * Walk every item of a parsed feed and download the ones that match one of
 * the target's filters.
 *
 * Items whose link is reported by proc_cache_lookup() are skipped. For the
 * remaining items each filter pattern is matched against the item title; on
 * a match the link is fetched (at most one successful fetch per item, shared
 * by all matching filters), stored in the filter's destination, announced on
 * stdout and recorded through proc_cache_update().
 */
static void process_items(rss_t rss, struct target *t) {
	struct rss_item entry;

	while (rss_walk_next(rss, &entry)) {
		struct http_file *fetched = NULL;
		int idx;

		if (!proc_cache_lookup(entry.link)) {
			for (idx = 0; idx < t->nr; idx++) {
				struct filter *rule = t->filter + idx;

				if (filter_match(rule->pattern, entry.title)) {
					/* Fetch lazily; a failed fetch is retried on the next match. */
					if (!fetched)
						fetched = http_fetch_file(entry.link);

					if (!fetched) {
						error("download failed");
					} else if (write_http_file(fetched, rule->dest) >= 0) {
						printf("Downloaded: %s (%s) to %s\n",
						       entry.title, entry.link, rule->dest);

						proc_cache_update(entry.link);
					}
				}
			}

			/*
			 * NOTE(review): this receives NULL when no filter matched or
			 * the fetch failed -- confirm http_free_file() tolerates NULL.
			 */
			http_free_file(fetched);
		}
	}
}
|
|
|
|
static void process(struct cconf *config) {
|
|
|
|
int i;
|
|
struct buffer *data;
|
|
|
|
proc_cache_purge(PROC_CACHE_PURGE_INTERVAL);
|
|
|
|
for(i=0; i < config->nr; i++) {
|
|
struct target *t = config->target + i;
|
|
rss_t rss;
|
|
|
|
data = http_fetch_page(t->src);
|
|
if (!data)
|
|
continue;
|
|
|
|
rss = rss_parse(data->block, data->len);
|
|
if (!rss) {
|
|
error("failed to parse rss: %s", t->src);
|
|
continue;
|
|
}
|
|
|
|
process_items(rss, t);
|
|
rss_free(rss);
|
|
http_free(data);
|
|
}
|
|
}
|
|
|
|
/*
 * Program entry point: read "<env dir>/config", open the process cache, run
 * one processing pass over all targets and release everything again.
 *
 * Returns 0 on success and 1 when the configuration cannot be read or the
 * process cache cannot be opened. Command-line arguments are not used.
 */
int main(int argc, char *argv[]) {
	struct cconf *config;
	char configfile[4096];
	int n, status = 1;

	(void)argc;
	(void)argv;

	/* Do not try to read a silently truncated path. */
	n = snprintf(configfile, sizeof(configfile), "%s/%s",
		     env_get_dir(), "config");
	if (n < 0 || (size_t)n >= sizeof(configfile)) {
		error("config path too long");
		return 1;
	}

	config = cconf_read(configfile);
	if (!config) {
		perror(configfile);
		return 1;
	}

	if (proc_cache_open() >= 0) {
		process(config);
		proc_cache_close();
		status = 0;
	}

	/* Released on the failure path as well, not only after a full run. */
	cconf_free(config);

	return status;
}
|