// Parser.cpp
// Comparing C++ and Perl
#include "Parser.h"

#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <boost/regex.hpp>

#include "Bibitem.h"
#include "MyException.h"
using namespace std;
static bool clean(string& s){
static const boost::regex no_ws("\\s*([-\\/\\d\\w]+)\\s*");
boost::cmatch matches;
if (boost::regex_match(s.c_str(), matches, no_ws)){
string clean_s( matches[1].first,
matches[1].second );
s = clean_s;
return true;
}
return false;
}
void Parser::canonicalize_and_verify_entries(
vector <Bibitem> &entries,
Technologies const &technologies)
{
// for each entry make sure the technology is set and valid
vector <Bibitem>::iterator it;
for (it = entries.begin(); it !=entries.end(); ++it){
// find the technology value
string tech;
if (it->get("technology", tech)){
// ensure than the technology value is valid
int i;
if (!technologies.entry_num(tech, i)){
throw MyException(
"entry has unknown technology information: " +
tech);
}
}
else{
throw MyException(
"entry has no technology information");
}
}
}
void Parser::save(
vector <Bibitem > &entries,
Bibitem ¤t)
{
// only store entries which actually have a title
// and authors
string title;
if (!current.get("title", title)) return;
string authors;
if (!current.get("authors", authors)) return;
// store the entry number
ostringstream oss;
oss << 1+entries.size();
current.put("entry_number", oss.str());
// get the first author
static const boost::regex delim(",");
boost::sregex_token_iterator i(
authors.begin(), authors.end(), delim, -1 ), end;
string first_author(*i);
// save the surname of the first author
static const boost::regex name(".*\\. ([-'\\w]+)\\s*");
boost::cmatch matches;
if (boost::regex_match(first_author.c_str(), matches, name)){
string surname( matches[1].first,
matches[1].second );
current.put("surname", surname);
}
else{
throw MyException("Surname can't be extracted: "+first_author+" for "+title);
}
// copy the bibitem to the entries vector
entries.push_back( current );
}
void Parser::append(
string &s,
string const &tag,
Bibitem ¤t)
{
// dont append blank lines
static const boost::regex ws("\\s*");
if (boost::regex_match(s, ws)) return;
// remove whitespace from before & after s
clean(s);
// only append to an existing tag
string s0 = "";
current.get(tag, s0);
if (s0.length()>0){
s = s0 + "\n" + s;
}
current.put(tag, s);
}
// Read the bibliography data from standard input into `entries`.
//
// Each input line is split on "@@". The text before the first "@@" continues
// the value of the tag currently being read. Every later piece is either
// "tag [=] value" for a tag known to `tags`, or a run of '-' characters,
// which ends the current item and hands it to save().
//
// Throws MyException for a piece that names an unknown tag ("no such tag")
// or that does not have the tag form at all ("bad format").
//
// NOTE(review): an item still being read when the input ends is only stored
// if a separator piece follows it - confirm that this is intended.
void Parser::parse_data_file(
    Tags const &tags,
    vector <Bibitem> &entries
)
{
    static const boost::regex delim("@@");
    string current_tag;
    Bibitem current;
    string s;
    // Loop on the result of getline() itself. Testing the stream before the
    // read lets a failed read through, and a failed read leaves `s` holding
    // the previous line, so the last line of an input without a trailing
    // newline would be processed twice.
    while (getline(cin, s)){
        boost::sregex_token_iterator i( s.begin(), s.end(), delim, -1 ), end;
        int c=0;
        for (; i != end; ++i){
            string dat(*i);
            if (c++ == 0){
                // text before the first "@@" belongs to the current tag
                append( dat, current_tag, current );
            }
            else{
                static const boost::regex poss_tag("([-\\w]+)\\s*=?\\s*(.*)");
                boost::cmatch matches;
                if (boost::regex_match(dat.c_str(), matches, poss_tag)){
                    string tag( matches[1].first,
                                matches[1].second );
                    string val( matches[2].first,
                                matches[2].second );
                    if (tags.exists(tag)){
                        current_tag = tag;
                        append ( val, current_tag, current );
                    }
                    else{
                        // check if we are just moving on to the next
                        // item
                        static const boost::regex ws("-+");
                        if (boost::regex_match(dat, ws)){
                            save( entries, current );
                            current.clear();
                        }
                        else{
                            throw MyException("no such tag: "+tag);
                        }
                    }
                }
                else{
                    throw MyException("bad format: "+dat);
                }
            }
        }
    }
}
// Write the table of contents to standard output as an HTML list.
//
// One item is written per technology, linking to the anchor named after the
// technology and labelled with the text `headings` holds for it. A final
// item links to the "Bibliography" anchor.
void Parser::print_technologies_table(
    Technologies const &technologies,
    Headings const &headings)
{
    cout << "<ul class='arrow'>" << endl;

    int idx = 0;
    while (idx < technologies.size()){
        // open the link first, then look up and print its label
        cout << "<li><a href='#" << technologies[idx] << "'>";
        string label;
        headings.get(technologies[idx], label);
        cout << label << "</a>" << endl;
        ++idx;
    }

    // closing item for the bibliography, then the end of the list
    cout << "<li><a href='#Bibliography'>Bibliography</a>" << endl
         << "</ul>" << endl << endl;
}
void Parser::print_data(
vector <Bibitem> &entries,
Technologies const &technologies,
Headings const &headings)
{
// sort the entries with the technology ordering
Bibitem::set_tech(&technologies);
Bibitem::set_order(BO_TECHNOLOGY_ORDER);
sort(entries.begin(), entries.end());
// print the first technology
string present_tech;
entries[0].get("technology", present_tech);
cout << "<a name='" << present_tech << "'></a>" << endl;
string tech_heading;
headings.get(present_tech, tech_heading);
cout << "<p class='hdr1'>" << tech_heading << "</p>" << endl << endl;
// print the first year
string present_year;
int present_year_int;
entries[0].get("year", present_year);
istringstream present_year_stream(present_year);
present_year_stream >> present_year_int;
cout << "<p class='hdr2'>"
<< present_year_int << "</p>" << endl << endl;
for (int i = 0; i < entries.size(); i++){
// if we have a new technology then print it
string new_tech;
entries[i].get("technology", new_tech);
if (new_tech != present_tech) {
present_tech = new_tech;
cout << "<a name='" << present_tech << "'></a>" << endl;
string tech_heading;
headings.get(present_tech, tech_heading);
cout << "<p class='hdr1'>"
<< tech_heading << "</p>" << endl << endl;
}
// if we have a new year then print it
string new_year;
int new_year_int;
entries[i].get("year", new_year);
istringstream new_year_stream(new_year);
new_year_stream >> new_year_int;
if (new_year_int != present_year_int){
present_year_int = new_year_int;
cout << "<p class='hdr2'>"
<< present_year_int << "</p>" << endl << endl;
}
// print the author information
string authors;
if (entries[i].get("authors", authors)){
cout << "<strong>" << authors << "</strong>" << endl;
} else {
}
// print the title information (linked if necessary}
string title, title_link;
string pre, post;
pre = "";
post = "";
if (entries[i].get("title_link", title_link)){
pre = "<a href='"+title_link+"'>";
post = "</a>";
}
entries[i].get("title", title);
cout << pre << title << post << endl;
// print the description if present, otherwise the abstract
string description;
if (entries[i].get("description", description)){
cout << description << endl;
}
else{
string abstract;
if (entries[i].get("abstract", abstract)){
cout << abstract << endl;
}
}
// print a newline between entries
cout << "<p>" << endl;
}
}
void Parser::print_bib(
vector <Bibitem> &entries)
{
// sort the entries with the technology ordering
Bibitem::set_order(BO_AUTHOR_ORDER);
sort(entries.begin(), entries.end());
cout << "<a name='Bibliography'></a>" << endl;
cout << "<p class='hdr1'>Bibliography</p>" << endl;
for (int i = 0; i < entries.size(); i++){
string authors, title, year, conference,
location, series, vol, publisher, workshop,
eprint_ref, tech_report;
entries[i].get("year", year);
// Print the author names
cout << "<p>" << endl;
if (entries[i].get("authors", authors)){
cout << authors << ", ";
}
// Print the paper title
if (entries[i].get("title", title)){
cout << title << ", ";
}
// Print conference info correctly
if (entries[i].get("conference", conference)){
if (conference == "CRYPTO" ||
conference == "EUROCRYPT" ||
conference == "ASIACRYPT"){
cout << "Advances in Cryptology - "
<< conference
<< " " << year << ", ";
}
else{
cout << conference << " " << year << ", ";
}
if (entries[i].get("location", location)){
cout << location << ", ";
}
if (entries[i].get("series", series) &&
entries[i].get("vol", vol)){
cout << series << " vol. " << vol << ", ";
}
if (entries[i].get("publisher", publisher)){
cout << publisher << ", ";
}
}
// Print workshop info correctly
if (entries[i].get("workshop", workshop)){
cout << workshop << " " << year << ", ";
if (entries[i].get("location", location)){
cout << location << ", ";
}
// there may not be workshop proceedings (in which case
// this info wont be printed)
if (entries[i].get("series", series) &&
entries[i].get("vol", vol)){
cout << series << " vol. " << vol << ", ";
}
if (entries[i].get("publisher", publisher)){
cout << publisher << ", ";
}
}
// Print eprint submissions correctly
if (entries[i].get("eprint_ref", eprint_ref)){
cout << "Cryptology ePrint Archive, ref. " << eprint_ref << ", ";
}
// Print eprint submissions correctly
if (entries[i].get("tech_report", tech_report)){
cout << "NTRU Technical Report #" << tech_report << ", ";
}
// Always end with the year
cout << year << "." << endl;
}
}
// Wednesday, April 29, 2009 (1)